Implement constant-time generic multiplication.

This is slower, but constant-time. It intentionally omits the signed
digit optimization because we cannot be sure the doubling case will be
unreachable for all curves. This is a fallback generic implementation
for curves which we must support for compatibility but which are not
common or important enough to justify curve-specific work.

Before:
Did 814 ECDH P-384 operations in 1085384us (750.0 ops/sec)
Did 1430 ECDSA P-384 signing operations in 1081988us (1321.6 ops/sec)
Did 308 ECDH P-521 operations in 1057741us (291.2 ops/sec)
Did 539 ECDSA P-521 signing operations in 1049797us (513.4 ops/sec)

After:
Did 715 ECDH P-384 operations in 1080161us (661.9 ops/sec)
Did 1188 ECDSA P-384 verify operations in 1069567us (1110.7 ops/sec)
Did 275 ECDH P-521 operations in 1060503us (259.3 ops/sec)
Did 506 ECDSA P-521 signing operations in 1084739us (466.5 ops/sec)

But we're still faster than the old BIGNUM implementation. EC_FELEM
more than paid for both the loss of points_make_affine and this CL.

Bug: 239
Change-Id: I65d71a731aad16b523928ee47618822d503ea704
Reviewed-on: https://boringssl-review.googlesource.com/27708
Commit-Queue: David Benjamin <davidben@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/bcm.c b/crypto/fipsmodule/bcm.c
index 5d9ebfc..51ac7d5 100644
--- a/crypto/fipsmodule/bcm.c
+++ b/crypto/fipsmodule/bcm.c
@@ -66,6 +66,7 @@
 #include "ec/p256-x86_64.c"
 #include "ec/scalar.c"
 #include "ec/simple.c"
+#include "ec/simple_mul.c"
 #include "ec/util.c"
 #include "ec/wnaf.c"
 #include "hmac/hmac.c"
diff --git a/crypto/fipsmodule/ec/ec_montgomery.c b/crypto/fipsmodule/ec/ec_montgomery.c
index eb37e50..9eace95 100644
--- a/crypto/fipsmodule/ec/ec_montgomery.c
+++ b/crypto/fipsmodule/ec/ec_montgomery.c
@@ -225,8 +225,8 @@
   out->group_finish = ec_GFp_mont_group_finish;
   out->group_set_curve = ec_GFp_mont_group_set_curve;
   out->point_get_affine_coordinates = ec_GFp_mont_point_get_affine_coordinates;
-  out->mul = ec_wNAF_mul /* XXX: Not constant time. */;
-  out->mul_public = ec_wNAF_mul;
+  out->mul = ec_GFp_simple_mul;
+  out->mul_public = ec_GFp_simple_mul_public;
   out->felem_mul = ec_GFp_mont_felem_mul;
   out->felem_sqr = ec_GFp_mont_felem_sqr;
   out->bignum_to_felem = ec_GFp_mont_bignum_to_felem;
diff --git a/crypto/fipsmodule/ec/internal.h b/crypto/fipsmodule/ec/internal.h
index aba7d1d..bb172b2 100644
--- a/crypto/fipsmodule/ec/internal.h
+++ b/crypto/fipsmodule/ec/internal.h
@@ -287,6 +287,10 @@
     const EC_GROUP *group, EC_POINT *r, const EC_SCALAR *g_scalar,
     const EC_POINT *p, const EC_SCALAR *p_scalar, BN_CTX *ctx);
 
+void ec_GFp_simple_mul(const EC_GROUP *group, EC_RAW_POINT *r,
+                       const EC_SCALAR *g_scalar, const EC_RAW_POINT *p,
+                       const EC_SCALAR *p_scalar);
+
 // ec_compute_wNAF writes the modified width-(w+1) Non-Adjacent Form (wNAF) of
 // |scalar| to |out|. |out| must have room for |bits| + 1 elements, each of
 // which will be either zero or odd with an absolute value less than  2^w
@@ -298,9 +302,9 @@
 void ec_compute_wNAF(const EC_GROUP *group, int8_t *out,
                      const EC_SCALAR *scalar, size_t bits, int w);
 
-void ec_wNAF_mul(const EC_GROUP *group, EC_RAW_POINT *r,
-                 const EC_SCALAR *g_scalar, const EC_RAW_POINT *p,
-                 const EC_SCALAR *p_scalar);
+void ec_GFp_simple_mul_public(const EC_GROUP *group, EC_RAW_POINT *r,
+                              const EC_SCALAR *g_scalar, const EC_RAW_POINT *p,
+                              const EC_SCALAR *p_scalar);
 
 // method functions in simple.c
 int ec_GFp_simple_group_init(EC_GROUP *);
diff --git a/crypto/fipsmodule/ec/simple.c b/crypto/fipsmodule/ec/simple.c
index 8bbb5a9..5c63711 100644
--- a/crypto/fipsmodule/ec/simple.c
+++ b/crypto/fipsmodule/ec/simple.c
@@ -287,9 +287,7 @@
 
   BN_ULONG yneq = ec_felem_non_zero_mask(group, &r);
 
-  // TODO(davidben): Analyze how case relates to timing considerations for the
-  // supported curves which hit it (P-224, P-384, and P-521) and the
-  // to-be-written constant-time generic multiplication implementation.
+  // This case will never occur in the constant-time |ec_GFp_simple_mul|.
   if (!xneq && !yneq && z1nz && z2nz) {
     ec_GFp_simple_dbl(group, out, a);
     return;
diff --git a/crypto/fipsmodule/ec/simple_mul.c b/crypto/fipsmodule/ec/simple_mul.c
new file mode 100644
index 0000000..394f3199
--- /dev/null
+++ b/crypto/fipsmodule/ec/simple_mul.c
@@ -0,0 +1,98 @@
+/* Copyright (c) 2018, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/ec.h>
+
+#include <assert.h>
+
+#include "internal.h"
+#include "../bn/internal.h"
+#include "../../internal.h"
+
+
+static void ec_GFp_simple_mul_single(const EC_GROUP *group, EC_RAW_POINT *r,
+                                     const EC_RAW_POINT *p,
+                                     const EC_SCALAR *scalar) {
+  // This is a generic implementation for uncommon curves that not do not
+  // warrant a tuned one. It uses unsigned digits so that the doubling case in
+  // |ec_GFp_simple_add| is always unreachable, erring on safety and simplicity.
+
+  // Compute a table of the first 32 multiples of |p| (including infinity).
+  EC_RAW_POINT precomp[32];
+  ec_GFp_simple_point_set_to_infinity(group, &precomp[0]);
+  ec_GFp_simple_point_copy(&precomp[1], p);
+  for (size_t j = 2; j < OPENSSL_ARRAY_SIZE(precomp); j++) {
+    if (j & 1) {
+      ec_GFp_simple_add(group, &precomp[j], &precomp[1], &precomp[j - 1]);
+    } else {
+      ec_GFp_simple_dbl(group, &precomp[j], &precomp[j / 2]);
+    }
+  }
+
+  // Divide bits in |scalar| into windows.
+  unsigned bits = BN_num_bits(&group->order);
+  int r_is_at_infinity = 1;
+  for (unsigned i = bits - 1; i < bits; i--) {
+    if (!r_is_at_infinity) {
+      ec_GFp_simple_dbl(group, r, r);
+    }
+    if (i % 5 == 0) {
+      // Compute the next window value.
+      const size_t width = group->order.width;
+      uint8_t window = bn_is_bit_set_words(scalar->words, width, i + 4) << 4;
+      window |= bn_is_bit_set_words(scalar->words, width, i + 3) << 3;
+      window |= bn_is_bit_set_words(scalar->words, width, i + 2) << 2;
+      window |= bn_is_bit_set_words(scalar->words, width, i + 1) << 1;
+      window |= bn_is_bit_set_words(scalar->words, width, i);
+
+      // Select the entry in constant-time.
+      EC_RAW_POINT tmp;
+      for (size_t j = 0; j < OPENSSL_ARRAY_SIZE(precomp); j++) {
+        BN_ULONG mask = constant_time_eq_w(j, window);
+        ec_felem_select(group, &tmp.X, mask, &precomp[j].X, &tmp.X);
+        ec_felem_select(group, &tmp.Y, mask, &precomp[j].Y, &tmp.Y);
+        ec_felem_select(group, &tmp.Z, mask, &precomp[j].Z, &tmp.Z);
+      }
+
+      if (r_is_at_infinity) {
+        ec_GFp_simple_point_copy(r, &tmp);
+        r_is_at_infinity = 0;
+      } else {
+        ec_GFp_simple_add(group, r, r, &tmp);
+      }
+    }
+  }
+  if (r_is_at_infinity) {
+    ec_GFp_simple_point_set_to_infinity(group, r);
+  }
+}
+
+void ec_GFp_simple_mul(const EC_GROUP *group, EC_RAW_POINT *r,
+                       const EC_SCALAR *g_scalar, const EC_RAW_POINT *p,
+                       const EC_SCALAR *p_scalar) {
+  assert(g_scalar != NULL || p_scalar != NULL);
+  if (p_scalar == NULL) {
+    ec_GFp_simple_mul_single(group, r, &group->generator->raw, g_scalar);
+  } else if (g_scalar == NULL) {
+    ec_GFp_simple_mul_single(group, r, p, p_scalar);
+  } else {
+    // Support constant-time two-point multiplication for compatibility.  This
+    // does not actually come up in keygen, ECDH, or ECDSA, so we implement it
+    // the naive way.
+    ec_GFp_simple_mul_single(group, r, &group->generator->raw, g_scalar);
+    EC_RAW_POINT tmp;
+    ec_GFp_simple_mul_single(group, &tmp, p, p_scalar);
+    ec_GFp_simple_add(group, r, r, &tmp);
+  }
+}
diff --git a/crypto/fipsmodule/ec/wnaf.c b/crypto/fipsmodule/ec/wnaf.c
index e136079..145caa0 100644
--- a/crypto/fipsmodule/ec/wnaf.c
+++ b/crypto/fipsmodule/ec/wnaf.c
@@ -72,7 +72,6 @@
 
 #include <openssl/bn.h>
 #include <openssl/err.h>
-#include <openssl/mem.h>
 #include <openssl/thread.h>
 
 #include "internal.h"
@@ -169,37 +168,30 @@
   }
 }
 
-// EC_WNAF_WINDOW_BITS is the window size to use for |ec_wNAF_mul|.
+// EC_WNAF_WINDOW_BITS is the window size to use for |ec_GFp_simple_mul_public|.
 #define EC_WNAF_WINDOW_BITS 4
 
-// EC_WNAF_TABLE_SIZE is the table size to use for |ec_wNAF_mul|.
+// EC_WNAF_TABLE_SIZE is the table size to use for |ec_GFp_simple_mul_public|.
 #define EC_WNAF_TABLE_SIZE (1 << (EC_WNAF_WINDOW_BITS - 1))
 
-void ec_wNAF_mul(const EC_GROUP *group, EC_RAW_POINT *r,
-                 const EC_SCALAR *g_scalar, const EC_RAW_POINT *p,
-                 const EC_SCALAR *p_scalar) {
+void ec_GFp_simple_mul_public(const EC_GROUP *group, EC_RAW_POINT *r,
+                              const EC_SCALAR *g_scalar, const EC_RAW_POINT *p,
+                              const EC_SCALAR *p_scalar) {
   size_t bits = BN_num_bits(&group->order);
   size_t wNAF_len = bits + 1;
 
-  // TODO(davidben): |mul_public| is for ECDSA verification which can assume
-  // non-NULL inputs, but this code is also used for |mul| which cannot. It's
-  // not constant-time, so replace the generic |mul| and remove the NULL checks.
   int8_t g_wNAF[EC_MAX_SCALAR_BYTES * 8 + 1];
   EC_RAW_POINT g_precomp[EC_WNAF_TABLE_SIZE];
   assert(wNAF_len <= OPENSSL_ARRAY_SIZE(g_wNAF));
-  if (g_scalar != NULL) {
-    const EC_RAW_POINT *g = &group->generator->raw;
-    ec_compute_wNAF(group, g_wNAF, g_scalar, bits, EC_WNAF_WINDOW_BITS);
-    compute_precomp(group, g_precomp, g, EC_WNAF_TABLE_SIZE);
-  }
+  const EC_RAW_POINT *g = &group->generator->raw;
+  ec_compute_wNAF(group, g_wNAF, g_scalar, bits, EC_WNAF_WINDOW_BITS);
+  compute_precomp(group, g_precomp, g, EC_WNAF_TABLE_SIZE);
 
   int8_t p_wNAF[EC_MAX_SCALAR_BYTES * 8 + 1];
   EC_RAW_POINT p_precomp[EC_WNAF_TABLE_SIZE];
   assert(wNAF_len <= OPENSSL_ARRAY_SIZE(p_wNAF));
-  if (p_scalar != NULL) {
-    ec_compute_wNAF(group, p_wNAF, p_scalar, bits, EC_WNAF_WINDOW_BITS);
-    compute_precomp(group, p_precomp, p, EC_WNAF_TABLE_SIZE);
-  }
+  ec_compute_wNAF(group, p_wNAF, p_scalar, bits, EC_WNAF_WINDOW_BITS);
+  compute_precomp(group, p_precomp, p, EC_WNAF_TABLE_SIZE);
 
   EC_RAW_POINT tmp;
   int r_is_at_infinity = 1;
@@ -208,7 +200,7 @@
       ec_GFp_simple_dbl(group, r, r);
     }
 
-    if (g_scalar != NULL && g_wNAF[k] != 0) {
+    if (g_wNAF[k] != 0) {
       lookup_precomp(group, &tmp, g_precomp, g_wNAF[k]);
       if (r_is_at_infinity) {
         ec_GFp_simple_point_copy(r, &tmp);
@@ -218,7 +210,7 @@
       }
     }
 
-    if (p_scalar != NULL && p_wNAF[k] != 0) {
+    if (p_wNAF[k] != 0) {
       lookup_precomp(group, &tmp, p_precomp, p_wNAF[k]);
       if (r_is_at_infinity) {
         ec_GFp_simple_point_copy(r, &tmp);
@@ -232,7 +224,4 @@
   if (r_is_at_infinity) {
     ec_GFp_simple_point_set_to_infinity(group, r);
   }
-
-  OPENSSL_cleanse(&g_wNAF, sizeof(g_wNAF));
-  OPENSSL_cleanse(&p_wNAF, sizeof(p_wNAF));
 }