Add a constant-time generic modular inverse function.

This uses the full binary GCD algorithm, where all four of A, B, C, and
D must be retained. (BN_mod_inverse_odd implements the odd-modulus
version, which only needs A and C.) It is patterned after the version
in the Handbook of Applied Cryptography, but tweaked so the coefficients
are non-negative and bounded.

Median of 29 RSA keygens: 0m0.225s -> 0m0.220s
(Accuracy beyond 0.1s is questionable.)

Bug: 238
Change-Id: I6dc13524ea7c8ac1072592857880ddf141d87526
Reviewed-on: https://boringssl-review.googlesource.com/26370
Reviewed-by: Adam Langley <alangley@gmail.com>
diff --git a/crypto/fipsmodule/bn/bn_test.cc b/crypto/fipsmodule/bn/bn_test.cc
index 2ee6a16..99d7f16 100644
--- a/crypto/fipsmodule/bn/bn_test.cc
+++ b/crypto/fipsmodule/bn/bn_test.cc
@@ -867,6 +867,13 @@
 
   ASSERT_TRUE(BN_gcd(ret.get(), a.get(), m.get(), ctx));
   EXPECT_BIGNUMS_EQUAL("GCD(A, M)", BN_value_one(), ret.get());
+
+  ASSERT_TRUE(BN_nnmod(a.get(), a.get(), m.get(), ctx));
+  int no_inverse;
+  ASSERT_TRUE(
+      bn_mod_inverse_consttime(ret.get(), &no_inverse, a.get(), m.get(), ctx));
+  EXPECT_BIGNUMS_EQUAL("inv(A) (mod M) (constant-time)", mod_inv.get(),
+                       ret.get());
 }
 
 static void TestGCD(BIGNUMFileTest *t, BN_CTX *ctx) {
@@ -889,6 +896,28 @@
         << "A^-1 (mod B) computed, but it does not exist";
     EXPECT_FALSE(BN_mod_inverse(ret.get(), b.get(), a.get(), ctx))
         << "B^-1 (mod A) computed, but it does not exist";
+
+    if (!BN_is_zero(b.get())) {
+      bssl::UniquePtr<BIGNUM> a_reduced(BN_new());
+      ASSERT_TRUE(a_reduced);
+      ASSERT_TRUE(BN_nnmod(a_reduced.get(), a.get(), b.get(), ctx));
+      int no_inverse;
+      EXPECT_FALSE(bn_mod_inverse_consttime(ret.get(), &no_inverse,
+                                            a_reduced.get(), b.get(), ctx))
+          << "A^-1 (mod B) computed, but it does not exist";
+      EXPECT_TRUE(no_inverse);
+    }
+
+    if (!BN_is_zero(a.get())) {
+      bssl::UniquePtr<BIGNUM> b_reduced(BN_new());
+      ASSERT_TRUE(b_reduced);
+      ASSERT_TRUE(BN_nnmod(b_reduced.get(), b.get(), a.get(), ctx));
+      int no_inverse;
+      EXPECT_FALSE(bn_mod_inverse_consttime(ret.get(), &no_inverse,
+                                            b_reduced.get(), a.get(), ctx))
+          << "B^-1 (mod A) computed, but it does not exist";
+      EXPECT_TRUE(no_inverse);
+    }
   }
 
   int is_relative_prime;
diff --git a/crypto/fipsmodule/bn/gcd.c b/crypto/fipsmodule/bn/gcd.c
index bc40ec2..7868b40 100644
--- a/crypto/fipsmodule/bn/gcd.c
+++ b/crypto/fipsmodule/bn/gcd.c
@@ -123,6 +123,23 @@
   bn_select_words(a, mask, tmp, a, num);
 }
 
+static void maybe_rshift1_words_carry(BN_ULONG *a, BN_ULONG carry,
+                                      BN_ULONG mask, BN_ULONG *tmp,
+                                      size_t num) {
+  // After the |mask|-gated shift, OR |carry & mask| into the top bit of |a|.
+  maybe_rshift1_words(a, mask, tmp, num);
+  if (num > 0) {
+    a[num - 1] |= (carry & mask) << (BN_BITS2 - 1);
+  }
+}
+
+static BN_ULONG maybe_add_words(BN_ULONG *a, BN_ULONG mask, const BN_ULONG *b,
+                                BN_ULONG *tmp, size_t num) {
+  const BN_ULONG masked_carry = mask & bn_add_words(tmp, a, b, num);
+  bn_select_words(a, mask, tmp, a, num);  // |a| += |b| only under |mask|.
+  return masked_carry;
+}
+
 static int bn_gcd_consttime(BIGNUM *r, unsigned *out_shift, const BIGNUM *x,
                             const BIGNUM *y, BN_CTX *ctx) {
   size_t width = x->width > y->width ? x->width : y->width;
@@ -243,10 +260,163 @@
   return ret;
 }
 
-// solves ax == 1 (mod n)
-static int bn_mod_inverse_general(BIGNUM *out, int *out_no_inverse,
-                                  const BIGNUM *a, const BIGNUM *n,
-                                  BN_CTX *ctx);
+int bn_mod_inverse_consttime(BIGNUM *r, int *out_no_inverse, const BIGNUM *a,
+                             const BIGNUM *n, BN_CTX *ctx) {
+  *out_no_inverse = 0;
+  if (BN_is_negative(a) || BN_ucmp(a, n) >= 0) {
+    OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED);
+    return 0;
+  }
+  if (BN_is_zero(a)) {
+    if (BN_is_one(n)) {
+      BN_zero(r);
+      return 1;
+    }
+    *out_no_inverse = 1;
+    OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE);
+    return 0;
+  }
+
+  // This is a constant-time implementation of the extended binary GCD
+  // algorithm. It is adapted from the Handbook of Applied Cryptography, section
+  // 14.4.3, algorithm 14.51, and modified to bound coefficients and avoid
+  // negative numbers.
+  //
+  // For more details and proof of correctness, see
+  // https://github.com/mit-plv/fiat-crypto/pull/333. In particular, see |step|
+  // and |mod_inverse_consttime| for the algorithm in Gallina and see
+  // |mod_inverse_consttime_spec| for the correctness result.
+
+  if (!BN_is_odd(a) && !BN_is_odd(n)) {
+    *out_no_inverse = 1;
+    OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE);
+    return 0;
+  }
+
+  // This function exists to compute the RSA private exponent, where |a| is one
+  // word. Values bounded by |a| thus use |a_width| words instead of |n_width|.
+  size_t n_width = n->width, a_width = a->width;
+  if (a_width > n_width) {
+    a_width = n_width;
+  }
+
+  int ret = 0;
+  BN_CTX_start(ctx);
+  BIGNUM *u = BN_CTX_get(ctx);
+  BIGNUM *v = BN_CTX_get(ctx);
+  BIGNUM *A = BN_CTX_get(ctx);
+  BIGNUM *B = BN_CTX_get(ctx);
+  BIGNUM *C = BN_CTX_get(ctx);
+  BIGNUM *D = BN_CTX_get(ctx);
+  BIGNUM *tmp = BN_CTX_get(ctx);
+  BIGNUM *tmp2 = BN_CTX_get(ctx);
+  if (u == NULL || v == NULL || A == NULL || B == NULL || C == NULL ||
+      D == NULL || tmp == NULL || tmp2 == NULL ||
+      !BN_copy(u, a) ||
+      !BN_copy(v, n) ||
+      !BN_one(A) ||
+      !BN_one(D) ||
+      // For convenience, size |u| and |v| equivalently.
+      !bn_resize_words(u, n_width) ||
+      !bn_resize_words(v, n_width) ||
+      // |A| and |C| are bounded by |n|.
+      !bn_resize_words(A, n_width) ||
+      !bn_resize_words(C, n_width) ||
+      // |B| and |D| are bounded by |a|.
+      !bn_resize_words(B, a_width) ||
+      !bn_resize_words(D, a_width) ||
+      // |tmp| and |tmp2| may be used at either size.
+      !bn_resize_words(tmp, n_width) ||
+      !bn_resize_words(tmp2, n_width)) {
+    goto err;
+  }
+
+  // Each loop iteration halves at least one of |u| and |v|. Thus we need at
+  // most the combined bit width of inputs for at least one value to be zero.
+  unsigned a_bits = a_width * BN_BITS2, n_bits = n_width * BN_BITS2;
+  unsigned num_iters = a_bits + n_bits;
+  if (num_iters < a_bits) {
+    OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
+    goto err;
+  }
+
+  // Before and after each loop iteration, the following hold:
+  //
+  //   u = A*a - B*n
+  //   v = D*n - C*a
+  //   0 < u <= a
+  //   0 <= v <= n
+  //   0 <= A < n
+  //   0 <= B <= a
+  //   0 <= C < n
+  //   0 <= D <= a
+  //
+  // After each loop iteration, u and v only get smaller, and at least one of
+  // them shrinks by at least a factor of two.
+  for (unsigned i = 0; i < num_iters; i++) {
+    BN_ULONG both_odd = word_is_odd_mask(u->d[0]) & word_is_odd_mask(v->d[0]);
+
+    // If both |u| and |v| are odd, subtract the smaller from the larger.
+    BN_ULONG v_less_than_u =
+        (BN_ULONG)0 - bn_sub_words(tmp->d, v->d, u->d, n_width);
+    bn_select_words(v->d, both_odd & ~v_less_than_u, tmp->d, v->d, n_width);
+    bn_sub_words(tmp->d, u->d, v->d, n_width);
+    bn_select_words(u->d, both_odd & v_less_than_u, tmp->d, u->d, n_width);
+
+    // If we updated one of the values, update the corresponding coefficient.
+    BN_ULONG carry = bn_add_words(tmp->d, A->d, C->d, n_width);
+    carry -= bn_sub_words(tmp2->d, tmp->d, n->d, n_width);
+    bn_select_words(tmp->d, carry, tmp->d, tmp2->d, n_width);
+    bn_select_words(A->d, both_odd & v_less_than_u, tmp->d, A->d, n_width);
+    bn_select_words(C->d, both_odd & ~v_less_than_u, tmp->d, C->d, n_width);
+    // Likewise for B + D with |a|, reusing the |carry| mask computed above.
+    bn_add_words(tmp->d, B->d, D->d, a_width);
+    bn_sub_words(tmp2->d, tmp->d, a->d, a_width);
+    bn_select_words(tmp->d, carry, tmp->d, tmp2->d, a_width);
+    bn_select_words(B->d, both_odd & v_less_than_u, tmp->d, B->d, a_width);
+    bn_select_words(D->d, both_odd & ~v_less_than_u, tmp->d, D->d, a_width);
+
+    // Our loop invariants hold at this point. Additionally, exactly one of |u|
+    // and |v| is now even.
+    BN_ULONG u_is_even = ~word_is_odd_mask(u->d[0]);
+    BN_ULONG v_is_even = ~word_is_odd_mask(v->d[0]);
+    assert(u_is_even != v_is_even);
+
+    // Halve the even one and adjust the corresponding coefficient.
+    maybe_rshift1_words(u->d, u_is_even, tmp->d, n_width);
+    BN_ULONG A_or_B_is_odd =
+        word_is_odd_mask(A->d[0]) | word_is_odd_mask(B->d[0]);
+    BN_ULONG A_carry =
+        maybe_add_words(A->d, A_or_B_is_odd & u_is_even, n->d, tmp->d, n_width);
+    BN_ULONG B_carry =
+        maybe_add_words(B->d, A_or_B_is_odd & u_is_even, a->d, tmp->d, a_width);
+    maybe_rshift1_words_carry(A->d, A_carry, u_is_even, tmp->d, n_width);
+    maybe_rshift1_words_carry(B->d, B_carry, u_is_even, tmp->d, a_width);
+
+    maybe_rshift1_words(v->d, v_is_even, tmp->d, n_width);
+    BN_ULONG C_or_D_is_odd =
+        word_is_odd_mask(C->d[0]) | word_is_odd_mask(D->d[0]);
+    BN_ULONG C_carry =
+        maybe_add_words(C->d, C_or_D_is_odd & v_is_even, n->d, tmp->d, n_width);
+    BN_ULONG D_carry =
+        maybe_add_words(D->d, C_or_D_is_odd & v_is_even, a->d, tmp->d, a_width);
+    maybe_rshift1_words_carry(C->d, C_carry, v_is_even, tmp->d, n_width);
+    maybe_rshift1_words_carry(D->d, D_carry, v_is_even, tmp->d, a_width);
+  }
+
+  assert(BN_is_zero(v));
+  if (!BN_is_one(u)) {
+    *out_no_inverse = 1;
+    OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE);
+    goto err;
+  }
+
+  ret = BN_copy(r, A) != NULL;
+
+err:
+  BN_CTX_end(ctx);
+  return ret;
+}
 
 int BN_mod_inverse_odd(BIGNUM *out, int *out_no_inverse, const BIGNUM *a,
                        const BIGNUM *n, BN_CTX *ctx) {
@@ -442,7 +612,7 @@
 
   int no_inverse;
   if (!BN_is_odd(n)) {
-    if (!bn_mod_inverse_general(out, &no_inverse, a, n, ctx)) {
+    if (!bn_mod_inverse_consttime(out, &no_inverse, a, n, ctx)) {
       goto err;
     }
   } else if (!BN_mod_inverse_odd(out, &no_inverse, a, n, ctx)) {
@@ -488,139 +658,6 @@
   return ret;
 }
 
-// bn_mod_inverse_general is the general inversion algorithm that works for
-// both even and odd |n|. It was specifically designed to contain fewer
-// branches that may leak sensitive information; see "New Branch Prediction
-// Vulnerabilities in OpenSSL and Necessary Software Countermeasures" by
-// Onur Acıçmez, Shay Gueron, and Jean-Pierre Seifert.
-static int bn_mod_inverse_general(BIGNUM *out, int *out_no_inverse,
-                                  const BIGNUM *a, const BIGNUM *n,
-                                  BN_CTX *ctx) {
-  BIGNUM *A, *B, *X, *Y, *M, *D, *T;
-  int ret = 0;
-  int sign;
-
-  *out_no_inverse = 0;
-
-  BN_CTX_start(ctx);
-  A = BN_CTX_get(ctx);
-  B = BN_CTX_get(ctx);
-  X = BN_CTX_get(ctx);
-  D = BN_CTX_get(ctx);
-  M = BN_CTX_get(ctx);
-  Y = BN_CTX_get(ctx);
-  T = BN_CTX_get(ctx);
-  if (T == NULL) {
-    goto err;
-  }
-
-  BIGNUM *R = out;
-
-  BN_zero(Y);
-  if (!BN_one(X) || BN_copy(B, a) == NULL || BN_copy(A, n) == NULL) {
-    goto err;
-  }
-  A->neg = 0;
-
-  sign = -1;
-  // From  B = a mod |n|,  A = |n|  it follows that
-  //
-  //      0 <= B < A,
-  //     -sign*X*a  ==  B   (mod |n|),
-  //      sign*Y*a  ==  A   (mod |n|).
-
-  while (!BN_is_zero(B)) {
-    BIGNUM *tmp;
-
-    //      0 < B < A,
-    // (*) -sign*X*a  ==  B   (mod |n|),
-    //      sign*Y*a  ==  A   (mod |n|)
-
-    // (D, M) := (A/B, A%B) ...
-    if (!BN_div(D, M, A, B, ctx)) {
-      goto err;
-    }
-
-    // Now
-    //      A = D*B + M;
-    // thus we have
-    // (**)  sign*Y*a  ==  D*B + M   (mod |n|).
-
-    tmp = A;  // keep the BIGNUM object, the value does not matter
-
-    // (A, B) := (B, A mod B) ...
-    A = B;
-    B = M;
-    // ... so we have  0 <= B < A  again
-
-    // Since the former  M  is now  B  and the former  B  is now  A,
-    // (**) translates into
-    //       sign*Y*a  ==  D*A + B    (mod |n|),
-    // i.e.
-    //       sign*Y*a - D*A  ==  B    (mod |n|).
-    // Similarly, (*) translates into
-    //      -sign*X*a  ==  A          (mod |n|).
-    //
-    // Thus,
-    //   sign*Y*a + D*sign*X*a  ==  B  (mod |n|),
-    // i.e.
-    //        sign*(Y + D*X)*a  ==  B  (mod |n|).
-    //
-    // So if we set  (X, Y, sign) := (Y + D*X, X, -sign),  we arrive back at
-    //      -sign*X*a  ==  B   (mod |n|),
-    //       sign*Y*a  ==  A   (mod |n|).
-    // Note that  X  and  Y  stay non-negative all the time.
-
-    if (!BN_mul(tmp, D, X, ctx)) {
-      goto err;
-    }
-    if (!BN_add(tmp, tmp, Y)) {
-      goto err;
-    }
-
-    M = Y;  // keep the BIGNUM object, the value does not matter
-    Y = X;
-    X = tmp;
-    sign = -sign;
-  }
-
-  if (!BN_is_one(A)) {
-    *out_no_inverse = 1;
-    OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE);
-    goto err;
-  }
-
-  // The while loop (Euclid's algorithm) ends when
-  //      A == gcd(a,n);
-  // we have
-  //       sign*Y*a  ==  A  (mod |n|),
-  // where  Y  is non-negative.
-
-  if (sign < 0) {
-    if (!BN_sub(Y, n, Y)) {
-      goto err;
-    }
-  }
-  // Now  Y*a  ==  A  (mod |n|).
-
-  // Y*a == 1  (mod |n|)
-  if (!Y->neg && BN_ucmp(Y, n) < 0) {
-    if (!BN_copy(R, Y)) {
-      goto err;
-    }
-  } else {
-    if (!BN_nnmod(R, Y, n, ctx)) {
-      goto err;
-    }
-  }
-
-  ret = 1;
-
-err:
-  BN_CTX_end(ctx);
-  return ret;
-}
-
 int bn_mod_inverse_prime(BIGNUM *out, const BIGNUM *a, const BIGNUM *p,
                          BN_CTX *ctx, const BN_MONT_CTX *mont_p) {
   BN_CTX_start(ctx);
diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h
index 79e5a41..a8ad129 100644
--- a/crypto/fipsmodule/bn/internal.h
+++ b/crypto/fipsmodule/bn/internal.h
@@ -342,18 +342,6 @@
 #error "Either BN_ULLONG or BN_UMULT_LOHI must be defined on every platform."
 #endif
 
-// bn_mod_inverse_prime sets |out| to the modular inverse of |a| modulo |p|,
-// computed with Fermat's Little Theorem. It returns one on success and zero on
-// error. If |mont_p| is NULL, one will be computed temporarily.
-int bn_mod_inverse_prime(BIGNUM *out, const BIGNUM *a, const BIGNUM *p,
-                         BN_CTX *ctx, const BN_MONT_CTX *mont_p);
-
-// bn_mod_inverse_secret_prime behaves like |bn_mod_inverse_prime| but uses
-// |BN_mod_exp_mont_consttime| instead of |BN_mod_exp_mont| in hopes of
-// protecting the exponent.
-int bn_mod_inverse_secret_prime(BIGNUM *out, const BIGNUM *a, const BIGNUM *p,
-                                BN_CTX *ctx, const BN_MONT_CTX *mont_p);
-
 // bn_jacobi returns the Jacobi symbol of |a| and |b| (which is -1, 0 or 1), or
 // -2 on error.
 int bn_jacobi(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
@@ -463,6 +451,30 @@
 int bn_mod_lshift_consttime(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m,
                             BN_CTX *ctx);
 
+// bn_mod_inverse_consttime sets |r| to |a|^-1, mod |n|. |a| must be non-
+// negative and less than |n|; otherwise it fails with BN_R_INPUT_NOT_REDUCED.
+// It returns one on success and zero on error. On failure, |*out_no_inverse|
+// is set to one if |a| has no inverse mod |n| and to zero otherwise.
+//
+// This function treats both |a| and |n| as secret, provided they are both non-
+// zero and the inverse exists. It should only be used for even moduli where
+// none of the less general implementations are applicable.
+OPENSSL_EXPORT int bn_mod_inverse_consttime(BIGNUM *r, int *out_no_inverse,
+                                            const BIGNUM *a, const BIGNUM *n,
+                                            BN_CTX *ctx);
+
+// bn_mod_inverse_prime sets |out| to the modular inverse of |a| modulo |p|,
+// computed with Fermat's Little Theorem. It returns one on success and zero on
+// error. If |mont_p| is NULL, one will be computed temporarily.
+int bn_mod_inverse_prime(BIGNUM *out, const BIGNUM *a, const BIGNUM *p,
+                         BN_CTX *ctx, const BN_MONT_CTX *mont_p);
+
+// bn_mod_inverse_secret_prime behaves like |bn_mod_inverse_prime| but uses
+// |BN_mod_exp_mont_consttime| instead of |BN_mod_exp_mont| in hopes of
+// protecting the exponent.
+int bn_mod_inverse_secret_prime(BIGNUM *out, const BIGNUM *a, const BIGNUM *p,
+                                BN_CTX *ctx, const BN_MONT_CTX *mont_p);
+
 
 // Low-level operations for small numbers.
 //