Split p224-64.c multiplication functions in three.

See I9c20b660ce4b58dc633588cfd5b2e97a40203ec3 for motivation. This aligns with
the other curves. In doing so, I removed the constant-time table lookups from
mul_public because it was easy, which gave a small performance improvement. I
did not further use ec_compute_wNAF, on the assumption that we do not care
enough about P-224 ECDSA performance to bother.

Before:
Did 63756 ECDSA P-224 verify operations in 5032477us (12668.9 ops/sec)
After:
Did 71914 ECDSA P-224 verify operations in 5042356us (14262.0 ops/sec) [+12.5%]

Change-Id: Ifd20293aca09e578c85d4692294caffc1b287909
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/36464
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/ec/p224-64.c b/crypto/fipsmodule/ec/p224-64.c
index 7ae7c59..cc88f15 100644
--- a/crypto/fipsmodule/ec/p224-64.c
+++ b/crypto/fipsmodule/ec/p224-64.c
@@ -871,95 +871,6 @@
   return (in[i >> 3] >> (i & 7)) & 1;
 }
 
-// Interleaved point multiplication using precomputed point multiples:
-// The small point multiples 0*P, 1*P, ..., 16*P are in p_pre_comp, the scalars
-// in p_scalar, if non-NULL. If g_scalar is non-NULL, we also add this multiple
-// of the generator, using certain (large) precomputed multiples in
-// g_p224_pre_comp. Output point (X, Y, Z) is stored in x_out, y_out, z_out
-static void p224_batch_mul(p224_felem x_out, p224_felem y_out, p224_felem z_out,
-                           const uint8_t *p_scalar, const uint8_t *g_scalar,
-                           const p224_felem p_pre_comp[17][3]) {
-  p224_felem nq[3], tmp[4];
-  uint64_t bits;
-  uint8_t sign, digit;
-
-  // set nq to the point at infinity
-  OPENSSL_memset(nq, 0, 3 * sizeof(p224_felem));
-
-  // Loop over both scalars msb-to-lsb, interleaving additions of multiples of
-  // the generator (two in each of the last 28 rounds) and additions of p (every
-  // 5th round).
-  int skip = 1;  // save two point operations in the first round
-  size_t i = p_scalar != NULL ? 220 : 27;
-  for (;;) {
-    // double
-    if (!skip) {
-      p224_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
-    }
-
-    // add multiples of the generator
-    if (g_scalar != NULL && i <= 27) {
-      // first, look 28 bits upwards
-      bits = p224_get_bit(g_scalar, i + 196) << 3;
-      bits |= p224_get_bit(g_scalar, i + 140) << 2;
-      bits |= p224_get_bit(g_scalar, i + 84) << 1;
-      bits |= p224_get_bit(g_scalar, i + 28);
-      // select the point to add, in constant time
-      p224_select_point(bits, 16, g_p224_pre_comp[1], tmp);
-
-      if (!skip) {
-        p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */,
-                       tmp[0], tmp[1], tmp[2]);
-      } else {
-        OPENSSL_memcpy(nq, tmp, 3 * sizeof(p224_felem));
-        skip = 0;
-      }
-
-      // second, look at the current position
-      bits = p224_get_bit(g_scalar, i + 168) << 3;
-      bits |= p224_get_bit(g_scalar, i + 112) << 2;
-      bits |= p224_get_bit(g_scalar, i + 56) << 1;
-      bits |= p224_get_bit(g_scalar, i);
-      // select the point to add, in constant time
-      p224_select_point(bits, 16, g_p224_pre_comp[0], tmp);
-      p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */,
-                     tmp[0], tmp[1], tmp[2]);
-    }
-
-    // do other additions every 5 doublings
-    if (p_scalar != NULL && i % 5 == 0) {
-      bits = p224_get_bit(p_scalar, i + 4) << 5;
-      bits |= p224_get_bit(p_scalar, i + 3) << 4;
-      bits |= p224_get_bit(p_scalar, i + 2) << 3;
-      bits |= p224_get_bit(p_scalar, i + 1) << 2;
-      bits |= p224_get_bit(p_scalar, i) << 1;
-      bits |= p224_get_bit(p_scalar, i - 1);
-      ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
-
-      // select the point to add or subtract
-      p224_select_point(digit, 17, p_pre_comp, tmp);
-      p224_felem_neg(tmp[3], tmp[1]);  // (X, -Y, Z) is the negative point
-      p224_copy_conditional(tmp[1], tmp[3], sign);
-
-      if (!skip) {
-        p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 0 /* mixed */,
-                       tmp[0], tmp[1], tmp[2]);
-      } else {
-        OPENSSL_memcpy(nq, tmp, 3 * sizeof(p224_felem));
-        skip = 0;
-      }
-    }
-
-    if (i == 0) {
-      break;
-    }
-    --i;
-  }
-  p224_felem_assign(x_out, nq[0]);
-  p224_felem_assign(y_out, nq[1]);
-  p224_felem_assign(z_out, nq[2]);
-}
-
 // Takes the Jacobian coordinates (X, Y, Z) of a point and returns
 // (X', Y') = (X/Z^2, Y/Z^3)
 static int ec_GFp_nistp224_point_get_affine_coordinates(
@@ -1027,67 +938,197 @@
   p224_felem_to_generic(&r->Z, z);
 }
 
-static void ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_RAW_POINT *r,
-                                       const EC_SCALAR *g_scalar,
-                                       const EC_RAW_POINT *p,
-                                       const EC_SCALAR *p_scalar) {
-  // Note this function must act in constant-time when exactly one of |g_scalar|
-  // and |p_scalar| is non-NULL. If both are non-NULL, this is not necessary and
-  // we'll hit the variable-time doubling case in |p224_point_add|.
-  //
-  // TODO(davidben): Split this like the other curves to ease analysis.
-  p224_felem p_pre_comp[17][3];
-  p224_felem x_out, y_out, z_out;
+static void ec_GFp_nistp224_make_precomp(p224_felem out[17][3],
+                                         const EC_RAW_POINT *p) {
+  OPENSSL_memset(out[0], 0, sizeof(p224_felem) * 3);
 
-  if (p != NULL && p_scalar != NULL) {
-    // We treat NULL scalars as 0, and NULL points as points at infinity, i.e.,
-    // they contribute nothing to the linear combination.
-    OPENSSL_memset(&p_pre_comp, 0, sizeof(p_pre_comp));
-    // precompute multiples
-    p224_generic_to_felem(x_out, &p->X);
-    p224_generic_to_felem(y_out, &p->Y);
-    p224_generic_to_felem(z_out, &p->Z);
+  p224_generic_to_felem(out[1][0], &p->X);
+  p224_generic_to_felem(out[1][1], &p->Y);
+  p224_generic_to_felem(out[1][2], &p->Z);
 
-    p224_felem_assign(p_pre_comp[1][0], x_out);
-    p224_felem_assign(p_pre_comp[1][1], y_out);
-    p224_felem_assign(p_pre_comp[1][2], z_out);
-
-    for (size_t j = 2; j <= 16; ++j) {
-      if (j & 1) {
-        p224_point_add(p_pre_comp[j][0], p_pre_comp[j][1], p_pre_comp[j][2],
-                       p_pre_comp[1][0], p_pre_comp[1][1], p_pre_comp[1][2], 0,
-                       p_pre_comp[j - 1][0], p_pre_comp[j - 1][1],
-                       p_pre_comp[j - 1][2]);
-      } else {
-        p224_point_double(p_pre_comp[j][0], p_pre_comp[j][1], p_pre_comp[j][2],
-                          p_pre_comp[j / 2][0], p_pre_comp[j / 2][1],
-                          p_pre_comp[j / 2][2]);
-      }
+  for (size_t j = 2; j <= 16; ++j) {
+    if (j & 1) {
+      p224_point_add(out[j][0], out[j][1], out[j][2], out[1][0], out[1][1],
+                     out[1][2], 0, out[j - 1][0], out[j - 1][1], out[j - 1][2]);
+    } else {
+      p224_point_double(out[j][0], out[j][1], out[j][2], out[j / 2][0],
+                        out[j / 2][1], out[j / 2][2]);
     }
   }
-
-  p224_batch_mul(x_out, y_out, z_out,
-                 (p != NULL && p_scalar != NULL) ? p_scalar->bytes : NULL,
-                 g_scalar != NULL ? g_scalar->bytes : NULL,
-                 (const p224_felem(*)[3])p_pre_comp);
-
-  // reduce the output to its unique minimal representation
-  p224_felem_to_generic(&r->X, x_out);
-  p224_felem_to_generic(&r->Y, y_out);
-  p224_felem_to_generic(&r->Z, z_out);
 }
 
 static void ec_GFp_nistp224_point_mul(const EC_GROUP *group, EC_RAW_POINT *r,
                                       const EC_RAW_POINT *p,
                                       const EC_SCALAR *scalar) {
-  ec_GFp_nistp224_points_mul(group, r, NULL /* g_scalar */, p, scalar);
+  p224_felem p_pre_comp[17][3];
+  ec_GFp_nistp224_make_precomp(p_pre_comp, p);
+
+  // Set nq to the point at infinity.
+  p224_felem nq[3], tmp[4];
+  OPENSSL_memset(nq, 0, 3 * sizeof(p224_felem));
+
+  int skip = 1;  // Save two point operations in the first round.
+  for (size_t i = 220; i < 221; i--) {
+    if (!skip) {
+      p224_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
+    }
+
+    // Add every 5 doublings.
+    if (i % 5 == 0) {
+      uint64_t bits = p224_get_bit(scalar->bytes, i + 4) << 5;
+      bits |= p224_get_bit(scalar->bytes, i + 3) << 4;
+      bits |= p224_get_bit(scalar->bytes, i + 2) << 3;
+      bits |= p224_get_bit(scalar->bytes, i + 1) << 2;
+      bits |= p224_get_bit(scalar->bytes, i) << 1;
+      bits |= p224_get_bit(scalar->bytes, i - 1);
+      uint8_t sign, digit;
+      ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
+
+      // Select the point to add or subtract.
+      p224_select_point(digit, 17, (const p224_felem(*)[3])p_pre_comp, tmp);
+      p224_felem_neg(tmp[3], tmp[1]);  // (X, -Y, Z) is the negative point
+      p224_copy_conditional(tmp[1], tmp[3], sign);
+
+      if (!skip) {
+        p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 0 /* mixed */,
+                       tmp[0], tmp[1], tmp[2]);
+      } else {
+        OPENSSL_memcpy(nq, tmp, 3 * sizeof(p224_felem));
+        skip = 0;
+      }
+    }
+  }
+
+  // Reduce the output to its unique minimal representation.
+  p224_felem_to_generic(&r->X, nq[0]);
+  p224_felem_to_generic(&r->Y, nq[1]);
+  p224_felem_to_generic(&r->Z, nq[2]);
 }
 
 static void ec_GFp_nistp224_point_mul_base(const EC_GROUP *group,
                                            EC_RAW_POINT *r,
                                            const EC_SCALAR *scalar) {
-  ec_GFp_nistp224_points_mul(group, r, scalar, NULL /* p */,
-                             NULL /* p_scalar */);
+  // Set nq to the point at infinity.
+  p224_felem nq[3], tmp[3];
+  OPENSSL_memset(nq, 0, 3 * sizeof(p224_felem));
+
+  int skip = 1;  // Save two point operations in the first round.
+  for (size_t i = 27; i < 28; i--) {
+    // double
+    if (!skip) {
+      p224_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
+    }
+
+    // First, look 28 bits upwards.
+    uint64_t bits = p224_get_bit(scalar->bytes, i + 196) << 3;
+    bits |= p224_get_bit(scalar->bytes, i + 140) << 2;
+    bits |= p224_get_bit(scalar->bytes, i + 84) << 1;
+    bits |= p224_get_bit(scalar->bytes, i + 28);
+    // Select the point to add, in constant time.
+    p224_select_point(bits, 16, g_p224_pre_comp[1], tmp);
+
+    if (!skip) {
+      p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */,
+                     tmp[0], tmp[1], tmp[2]);
+    } else {
+      OPENSSL_memcpy(nq, tmp, 3 * sizeof(p224_felem));
+      skip = 0;
+    }
+
+    // Second, look at the current position/
+    bits = p224_get_bit(scalar->bytes, i + 168) << 3;
+    bits |= p224_get_bit(scalar->bytes, i + 112) << 2;
+    bits |= p224_get_bit(scalar->bytes, i + 56) << 1;
+    bits |= p224_get_bit(scalar->bytes, i);
+    // Select the point to add, in constant time.
+    p224_select_point(bits, 16, g_p224_pre_comp[0], tmp);
+    p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */,
+                   tmp[0], tmp[1], tmp[2]);
+  }
+
+  // Reduce the output to its unique minimal representation.
+  p224_felem_to_generic(&r->X, nq[0]);
+  p224_felem_to_generic(&r->Y, nq[1]);
+  p224_felem_to_generic(&r->Z, nq[2]);
+}
+
+static void ec_GFp_nistp224_point_mul_public(const EC_GROUP *group,
+                                             EC_RAW_POINT *r,
+                                             const EC_SCALAR *g_scalar,
+                                             const EC_RAW_POINT *p,
+                                             const EC_SCALAR *p_scalar) {
+  // TODO(davidben): If P-224 ECDSA verify performance ever matters, using
+  // |ec_compute_wNAF| for |p_scalar| would likely be an easy improvement.
+  p224_felem p_pre_comp[17][3];
+  ec_GFp_nistp224_make_precomp(p_pre_comp, p);
+
+  // Set nq to the point at infinity.
+  p224_felem nq[3], tmp[3];
+  OPENSSL_memset(nq, 0, 3 * sizeof(p224_felem));
+
+  // Loop over both scalars msb-to-lsb, interleaving additions of multiples of
+  // the generator (two in each of the last 28 rounds) and additions of p (every
+  // 5th round).
+  int skip = 1;  // Save two point operations in the first round.
+  for (size_t i = 220; i < 221; i--) {
+    if (!skip) {
+      p224_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
+    }
+
+    // Add multiples of the generator.
+    if (i <= 27) {
+      // First, look 28 bits upwards.
+      uint64_t bits = p224_get_bit(g_scalar->bytes, i + 196) << 3;
+      bits |= p224_get_bit(g_scalar->bytes, i + 140) << 2;
+      bits |= p224_get_bit(g_scalar->bytes, i + 84) << 1;
+      bits |= p224_get_bit(g_scalar->bytes, i + 28);
+
+      p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */,
+                     g_p224_pre_comp[1][bits][0], g_p224_pre_comp[1][bits][1],
+                     g_p224_pre_comp[1][bits][2]);
+      assert(!skip);
+
+      // Second, look at the current position.
+      bits = p224_get_bit(g_scalar->bytes, i + 168) << 3;
+      bits |= p224_get_bit(g_scalar->bytes, i + 112) << 2;
+      bits |= p224_get_bit(g_scalar->bytes, i + 56) << 1;
+      bits |= p224_get_bit(g_scalar->bytes, i);
+      p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */,
+                     g_p224_pre_comp[0][bits][0], g_p224_pre_comp[0][bits][1],
+                     g_p224_pre_comp[0][bits][2]);
+    }
+
+    // Incorporate |p_scalar| every 5 doublings.
+    if (i % 5 == 0) {
+      uint64_t bits = p224_get_bit(p_scalar->bytes, i + 4) << 5;
+      bits |= p224_get_bit(p_scalar->bytes, i + 3) << 4;
+      bits |= p224_get_bit(p_scalar->bytes, i + 2) << 3;
+      bits |= p224_get_bit(p_scalar->bytes, i + 1) << 2;
+      bits |= p224_get_bit(p_scalar->bytes, i) << 1;
+      bits |= p224_get_bit(p_scalar->bytes, i - 1);
+      uint8_t sign, digit;
+      ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
+
+      // Select the point to add or subtract.
+      OPENSSL_memcpy(tmp, p_pre_comp[digit], 3 * sizeof(p224_felem));
+      if (sign) {
+        p224_felem_neg(tmp[1], tmp[1]);  // (X, -Y, Z) is the negative point
+      }
+
+      if (!skip) {
+        p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 0 /* mixed */,
+                       tmp[0], tmp[1], tmp[2]);
+      } else {
+        OPENSSL_memcpy(nq, tmp, 3 * sizeof(p224_felem));
+        skip = 0;
+      }
+    }
+  }
+
+  // Reduce the output to its unique minimal representation.
+  p224_felem_to_generic(&r->X, nq[0]);
+  p224_felem_to_generic(&r->Y, nq[1]);
+  p224_felem_to_generic(&r->Z, nq[2]);
 }
 
 static void ec_GFp_nistp224_felem_mul(const EC_GROUP *group, EC_FELEM *r,
@@ -1131,7 +1172,7 @@
   out->dbl = ec_GFp_nistp224_dbl;
   out->mul = ec_GFp_nistp224_point_mul;
   out->mul_base = ec_GFp_nistp224_point_mul_base;
-  out->mul_public = ec_GFp_nistp224_points_mul;
+  out->mul_public = ec_GFp_nistp224_point_mul_public;
   out->felem_mul = ec_GFp_nistp224_felem_mul;
   out->felem_sqr = ec_GFp_nistp224_felem_sqr;
   out->bignum_to_felem = ec_GFp_nistp224_bignum_to_felem;