Remove EC_LOOSE_SCALAR.

ECDSA converts digests to scalars by taking the leftmost n bits, where n
is the number of bits in the group order. This does not necessarily
produce a fully-reduced scalar.

Montgomery multiplication actually tolerates this slightly looser bound,
so we did not bother with the conditional subtraction. However, this
subtraction is free compared to the multiplication, inversion, and base
point multiplication. Simplify things by keeping it fully-reduced.

Change-Id: If49dffefccc21510f40418dc52ea4da7e3ff198f
Reviewed-on: https://boringssl-review.googlesource.com/26968
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/bn/div.c b/crypto/fipsmodule/bn/div.c
index d29ea34..1950561 100644
--- a/crypto/fipsmodule/bn/div.c
+++ b/crypto/fipsmodule/bn/div.c
@@ -455,11 +455,8 @@
   bn_select_words(r, 0 - borrow, tmp /* r < 0 */, r /* r >= 0 */, num);
 }
 
-// bn_mod_add_words sets |r| to |a| + |b| (mod |m|), using |tmp| as scratch
-// space. Each array is |num| words long. |a| and |b| must be < |m|. Any pair of
-// |r|, |a|, and |b| may alias.
-static void bn_mod_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
-                             const BN_ULONG *m, BN_ULONG *tmp, size_t num) {
+void bn_mod_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+                      const BN_ULONG *m, BN_ULONG *tmp, size_t num) {
   BN_ULONG carry = bn_add_words(r, a, b, num);
   bn_reduce_once_in_place(r, carry, m, tmp, num);
 }
diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h
index 8c121a0..668d8dd 100644
--- a/crypto/fipsmodule/bn/internal.h
+++ b/crypto/fipsmodule/bn/internal.h
@@ -454,6 +454,12 @@
 //
 // The following functions implement basic constant-time modular arithmetic.
 
+// bn_mod_add_words sets |r| to |a| + |b| (mod |m|), using |tmp| as scratch
+// space. Each array is |num| words long. |a| and |b| must be < |m|. Any pair of
+// |r|, |a|, and |b| may alias.
+void bn_mod_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+                      const BN_ULONG *m, BN_ULONG *tmp, size_t num);
+
 // bn_mod_add_consttime acts like |BN_mod_add_quick| but takes a |BN_CTX|.
 int bn_mod_add_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                          const BIGNUM *m, BN_CTX *ctx);
diff --git a/crypto/fipsmodule/ecdsa/ecdsa.c b/crypto/fipsmodule/ecdsa/ecdsa.c
index 2f73718..85490fa 100644
--- a/crypto/fipsmodule/ecdsa/ecdsa.c
+++ b/crypto/fipsmodule/ecdsa/ecdsa.c
@@ -66,42 +66,16 @@
 #include "../../internal.h"
 
 
-// EC_LOOSE_SCALAR is like |EC_SCALAR| but is bounded by 2^|BN_num_bits(order)|
-// rather than |order|.
-typedef union {
-  // bytes is the representation of the scalar in little-endian order.
-  uint8_t bytes[EC_MAX_SCALAR_BYTES];
-  BN_ULONG words[EC_MAX_SCALAR_WORDS];
-} EC_LOOSE_SCALAR;
-
-static void scalar_add_loose(const EC_GROUP *group, EC_LOOSE_SCALAR *r,
-                             const EC_LOOSE_SCALAR *a, const EC_SCALAR *b) {
-  // Add and subtract one copy of |order| if necessary. We have:
-  //   |a| + |b| < 2^BN_num_bits(order) + order
-  // so this leaves |r| < 2^BN_num_bits(order).
+static void scalar_add(const EC_GROUP *group, EC_SCALAR *r, const EC_SCALAR *a,
+                       const EC_SCALAR *b) {
   const BIGNUM *order = &group->order;
-  BN_ULONG carry = bn_add_words(r->words, a->words, b->words, order->width);
-  EC_LOOSE_SCALAR tmp;
-  BN_ULONG v =
-      bn_sub_words(tmp.words, r->words, order->d, order->width) - carry;
-  bn_select_words(r->words, 0u - v, r->words /* tmp < 0 */,
-                  tmp.words /* tmp >= 0 */, order->width);
+  BN_ULONG tmp[EC_MAX_SCALAR_WORDS];
+  bn_mod_add_words(r->words, a->words, b->words, order->d, tmp, order->width);
+  OPENSSL_cleanse(tmp, sizeof(tmp));
 }
 
-static int scalar_mod_mul_montgomery(const EC_GROUP *group, EC_SCALAR *r,
-                                     const EC_SCALAR *a, const EC_SCALAR *b) {
-  const BIGNUM *order = &group->order;
-  return bn_mod_mul_montgomery_small(r->words, order->width, a->words,
-                                     order->width, b->words, order->width,
-                                     group->order_mont);
-}
-
-static int scalar_mod_mul_montgomery_loose(const EC_GROUP *group, EC_SCALAR *r,
-                                           const EC_LOOSE_SCALAR *a,
-                                           const EC_SCALAR *b) {
-  // Although |a| is loose, |bn_mod_mul_montgomery_small| only requires the
-  // product not exceed R * |order|. |b| is fully reduced and |a| <
-  // 2^BN_num_bits(order) <= R, so this holds.
+static int scalar_mul_montgomery(const EC_GROUP *group, EC_SCALAR *r,
+                                 const EC_SCALAR *a, const EC_SCALAR *b) {
   const BIGNUM *order = &group->order;
   return bn_mod_mul_montgomery_small(r->words, order->width, a->words,
                                      order->width, b->words, order->width,
@@ -111,7 +85,7 @@
 // digest_to_scalar interprets |digest_len| bytes from |digest| as a scalar for
 // ECDSA. Note this value is not fully reduced modulo the order, only the
 // correct number of bits.
-static void digest_to_scalar(const EC_GROUP *group, EC_LOOSE_SCALAR *out,
+static void digest_to_scalar(const EC_GROUP *group, EC_SCALAR *out,
                              const uint8_t *digest, size_t digest_len) {
   const BIGNUM *order = &group->order;
   size_t num_bits = BN_num_bits(order);
@@ -129,6 +103,15 @@
   if (8 * digest_len > num_bits) {
     bn_rshift_words(out->words, out->words, 8 - (num_bits & 0x7), order->width);
   }
+
+  // |out| now has the same bit width as |order|, but this only bounds by
+  // 2*|order|. Subtract the order if out of range.
+  //
+  // Montgomery multiplication accepts the looser bounds, so this isn't strictly
+  // necessary, but it is a cleaner abstraction and has no performance impact.
+  BN_ULONG tmp[EC_MAX_SCALAR_WORDS];
+  bn_reduce_once_in_place(out->words, 0 /* no carry */, order->d, tmp,
+                          order->width);
 }
 
 // field_element_to_scalar reduces |r| modulo |group->order|. |r| must
@@ -233,8 +216,7 @@
     goto err;
   }
 
-  EC_SCALAR r, s, u1, u2, s_inv_mont;
-  EC_LOOSE_SCALAR m;
+  EC_SCALAR r, s, u1, u2, s_inv_mont, m;
   const BIGNUM *order = EC_GROUP_get0_order(group);
   if (BN_is_zero(sig->r) ||
       !ec_bignum_to_scalar(group, &r, sig->r) ||
@@ -260,8 +242,8 @@
   // |s_inv_mont| is in Montgomery form while |m| and |r| are not, so |u1| and
   // |u2| will be taken out of Montgomery form, as desired.
   digest_to_scalar(group, &m, digest, digest_len);
-  if (!scalar_mod_mul_montgomery_loose(group, &u1, &m, &s_inv_mont) ||
-      !scalar_mod_mul_montgomery(group, &u2, &r, &s_inv_mont)) {
+  if (!scalar_mul_montgomery(group, &u1, &m, &s_inv_mont) ||
+      !scalar_mul_montgomery(group, &u2, &r, &s_inv_mont)) {
     goto err;
   }
 
@@ -398,8 +380,7 @@
   int ok = 0;
   ECDSA_SIG *ret = ECDSA_SIG_new();
   BN_CTX *ctx = BN_CTX_new();
-  EC_SCALAR kinv_mont, r_mont, s;
-  EC_LOOSE_SCALAR m, tmp;
+  EC_SCALAR kinv_mont, r_mont, s, m, tmp;
   if (ret == NULL || ctx == NULL) {
     OPENSSL_PUT_ERROR(ECDSA, ERR_R_MALLOC_FAILURE);
     return NULL;
@@ -418,16 +399,16 @@
     if (!ec_bignum_to_scalar(group, &r_mont, ret->r) ||
         !bn_to_montgomery_small(r_mont.words, order->width, r_mont.words,
                                 order->width, group->order_mont) ||
-        !scalar_mod_mul_montgomery(group, &s, priv_key, &r_mont)) {
+        !scalar_mul_montgomery(group, &s, priv_key, &r_mont)) {
       goto err;
     }
 
     // Compute tmp = m + priv_key * r.
-    scalar_add_loose(group, &tmp, &m, &s);
+    scalar_add(group, &tmp, &m, &s);
 
     // Finally, multiply s by k^-1. That was retained in Montgomery form, so the
     // same technique as the previous multiplication works.
-    if (!scalar_mod_mul_montgomery_loose(group, &s, &tmp, &kinv_mont) ||
+    if (!scalar_mul_montgomery(group, &s, &tmp, &kinv_mont) ||
         !bn_set_words(ret->s, s.words, order->width)) {
       goto err;
     }