Make generic point arithmetic slightly less variable-time.

The generic code special-cases affine points, but this leaks
information. (Of course, the generic code also doesn't have a
constant-time multiply and other problems, but one thing at a time.)

The optimization in point doubling is not useful. Point multiplication
more-or-less never doubles an affine point. The optimization in point
addition *is* useful because the wNAF code converts the tables to
affine. Accordingly, align with the P-256 code which adds a 'mixed'
parameter.

(I haven't aligned the formally-verified point formulas themselves yet;
initial testing suggests that the large number of temporaries take a
perf hit with BIGNUM. I'll check the results in EC_FELEM, which will be
stack-allocated, to see if we still need to help the compiler out.)

Strangly, it actually got a bit faster with this change. I'm guessing
because now it doesn't need to bother with unnecessary comparisons and
maybe was kinder to the branch predictor?

Before:
Did 2201 ECDH P-384 operations in 3068341us (717.3 ops/sec)
Did 4092 ECDSA P-384 signing operations in 3076981us (1329.9 ops/sec)
Did 3503 ECDSA P-384 verify operations in 3024753us (1158.1 ops/sec)
Did 992 ECDH P-521 operations in 3017884us (328.7 ops/sec)
Did 1798 ECDSA P-521 signing operations in 3059000us (587.8 ops/sec)
Did 1581 ECDSA P-521 verify operations in 3033142us (521.2 ops/sec)

After:
Did 2310 ECDH P-384 operations in 3092648us (746.9 ops/sec)
Did 4080 ECDSA P-384 signing operations in 3044588us (1340.1 ops/sec)
Did 3520 ECDSA P-384 verify operations in 3056070us (1151.8 ops/sec)
Did 992 ECDH P-521 operations in 3012779us (329.3 ops/sec)
Did 1792 ECDSA P-521 signing operations in 3019459us (593.5 ops/sec)
Did 1600 ECDSA P-521 verify operations in 3047749us (525.0 ops/sec)

Bug: 239
Change-Id: If5d13825fc98e4c58bdd1580cf0245bf7ce93a82
Reviewed-on: https://boringssl-review.googlesource.com/27004
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
diff --git a/crypto/fipsmodule/ec/ec.c b/crypto/fipsmodule/ec/ec.c
index 904466a..ee7ec55 100644
--- a/crypto/fipsmodule/ec/ec.c
+++ b/crypto/fipsmodule/ec/ec.c
@@ -792,9 +792,19 @@
     OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS);
     return 0;
   }
-  return ec_GFp_simple_add(group, r, a, b, ctx);
+  return ec_GFp_simple_add(group, r, a, b, 0 /* both Jacobian */, ctx);
 }
 
+int ec_point_add_mixed(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
+                       const EC_POINT *b, BN_CTX *ctx) {
+  if (EC_GROUP_cmp(group, r->group, NULL) != 0 ||
+      EC_GROUP_cmp(group, a->group, NULL) != 0 ||
+      EC_GROUP_cmp(group, b->group, NULL) != 0) {
+    OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS);
+    return 0;
+  }
+  return ec_GFp_simple_add(group, r, a, b, 1 /* |b| is affine */, ctx);
+}
 
 int EC_POINT_dbl(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
                  BN_CTX *ctx) {
diff --git a/crypto/fipsmodule/ec/internal.h b/crypto/fipsmodule/ec/internal.h
index 742e94e..c5d7291 100644
--- a/crypto/fipsmodule/ec/internal.h
+++ b/crypto/fipsmodule/ec/internal.h
@@ -193,6 +193,11 @@
 int ec_random_nonzero_scalar(const EC_GROUP *group, EC_SCALAR *out,
                              const uint8_t additional_data[32]);
 
+// ec_point_add_mixed behaves like |EC_POINT_add|, but |&b->Z| must be zero or
+// one.
+int ec_point_add_mixed(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
+                       const EC_POINT *b, BN_CTX *ctx);
+
 // ec_point_mul_scalar sets |r| to generator * |g_scalar| + |p| *
 // |p_scalar|. Unlike other functions which take |EC_SCALAR|, |g_scalar| and
 // |p_scalar| need not be fully reduced. They need only contain as many bits as
@@ -238,7 +243,7 @@
                                                const BIGNUM *x, const BIGNUM *y,
                                                BN_CTX *);
 int ec_GFp_simple_add(const EC_GROUP *, EC_POINT *r, const EC_POINT *a,
-                      const EC_POINT *b, BN_CTX *);
+                      const EC_POINT *b, int mixed, BN_CTX *);
 int ec_GFp_simple_dbl(const EC_GROUP *, EC_POINT *r, const EC_POINT *a,
                       BN_CTX *);
 int ec_GFp_simple_invert(const EC_GROUP *, EC_POINT *, BN_CTX *);
diff --git a/crypto/fipsmodule/ec/simple.c b/crypto/fipsmodule/ec/simple.c
index e87409c..4fb394c 100644
--- a/crypto/fipsmodule/ec/simple.c
+++ b/crypto/fipsmodule/ec/simple.c
@@ -303,7 +303,11 @@
 }
 
 int ec_GFp_simple_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
-                      const EC_POINT *b, BN_CTX *ctx) {
+                      const EC_POINT *b, int mixed, BN_CTX *ctx) {
+  if (mixed) {
+    assert(BN_is_zero(&b->Z) || BN_cmp(&b->Z, &group->one) == 0);
+  }
+
   int (*field_mul)(const EC_GROUP *, BIGNUM *, const BIGNUM *, const BIGNUM *,
                    BN_CTX *);
   int (*field_sqr)(const EC_GROUP *, BIGNUM *, const BIGNUM *, BN_CTX *);
@@ -350,9 +354,7 @@
   // ('r' might be one of 'a' or 'b'.)
 
   // n1, n2
-  int b_Z_is_one = BN_cmp(&b->Z, &group->one) == 0;
-
-  if (b_Z_is_one) {
+  if (mixed) {
     if (!BN_copy(n1, &a->X) || !BN_copy(n2, &a->Y)) {
       goto end;
     }
@@ -373,26 +375,17 @@
   }
 
   // n3, n4
-  int a_Z_is_one = BN_cmp(&a->Z, &group->one) == 0;
-  if (a_Z_is_one) {
-    if (!BN_copy(n3, &b->X) || !BN_copy(n4, &b->Y)) {
-      goto end;
-    }
-    // n3 = X_b
-    // n4 = Y_b
-  } else {
-    if (!field_sqr(group, n0, &a->Z, ctx) ||
-        !field_mul(group, n3, &b->X, n0, ctx)) {
-      goto end;
-    }
-    // n3 = X_b * Z_a^2
-
-    if (!field_mul(group, n0, n0, &a->Z, ctx) ||
-        !field_mul(group, n4, &b->Y, n0, ctx)) {
-      goto end;
-    }
-    // n4 = Y_b * Z_a^3
+  if (!field_sqr(group, n0, &a->Z, ctx) ||
+      !field_mul(group, n3, &b->X, n0, ctx)) {
+    goto end;
   }
+  // n3 = X_b * Z_a^2
+
+  if (!field_mul(group, n0, n0, &a->Z, ctx) ||
+      !field_mul(group, n4, &b->Y, n0, ctx)) {
+    goto end;
+  }
+  // n4 = Y_b * Z_a^3
 
   // n5, n6
   if (!bn_mod_sub_consttime(n5, n1, n3, p, ctx) ||
@@ -426,25 +419,15 @@
   // 'n8' = n2 + n4
 
   // Z_r
-  if (a_Z_is_one && b_Z_is_one) {
-    if (!BN_copy(&r->Z, n5)) {
+  if (mixed) {
+    if (!BN_copy(n0, &a->Z)) {
       goto end;
     }
-  } else {
-    if (a_Z_is_one) {
-      if (!BN_copy(n0, &b->Z)) {
-        goto end;
-      }
-    } else if (b_Z_is_one) {
-      if (!BN_copy(n0, &a->Z)) {
-        goto end;
-      }
-    } else if (!field_mul(group, n0, &a->Z, &b->Z, ctx)) {
-      goto end;
-    }
-    if (!field_mul(group, &r->Z, n0, n5, ctx)) {
-      goto end;
-    }
+  } else if (!field_mul(group, n0, &a->Z, &b->Z, ctx)) {
+    goto end;
+  }
+  if (!field_mul(group, &r->Z, n0, n5, ctx)) {
+    goto end;
   }
 
   // Z_r = Z_a * Z_b * n5
@@ -534,15 +517,7 @@
   // ('r' might the same as 'a'.)
 
   // n1
-  if (BN_cmp(&a->Z, &group->one) == 0) {
-    if (!field_sqr(group, n0, &a->X, ctx) ||
-        !bn_mod_lshift1_consttime(n1, n0, p, ctx) ||
-        !bn_mod_add_consttime(n0, n0, n1, p, ctx) ||
-        !bn_mod_add_consttime(n1, n0, &group->a, p, ctx)) {
-      goto err;
-    }
-    // n1 = 3 * X_a^2 + a_curve
-  } else if (group->a_is_minus3) {
+  if (group->a_is_minus3) {
     if (!field_sqr(group, n1, &a->Z, ctx) ||
         !bn_mod_add_consttime(n0, &a->X, n1, p, ctx) ||
         !bn_mod_sub_consttime(n2, &a->X, n1, p, ctx) ||
@@ -567,14 +542,8 @@
   }
 
   // Z_r
-  if (BN_cmp(&a->Z, &group->one) == 0) {
-    if (!BN_copy(n0, &a->Y)) {
-      goto err;
-    }
-  } else if (!field_mul(group, n0, &a->Y, &a->Z, ctx)) {
-    goto err;
-  }
-  if (!bn_mod_lshift1_consttime(&r->Z, n0, p, ctx)) {
+  if (!field_mul(group, n0, &a->Y, &a->Z, ctx) ||
+      !bn_mod_lshift1_consttime(&r->Z, n0, p, ctx)) {
     goto err;
   }
   // Z_r = 2 * Y_a * Z_a
diff --git a/crypto/fipsmodule/ec/wnaf.c b/crypto/fipsmodule/ec/wnaf.c
index 7bc0bc7..49fc8bc 100644
--- a/crypto/fipsmodule/ec/wnaf.c
+++ b/crypto/fipsmodule/ec/wnaf.c
@@ -291,7 +291,9 @@
 
   tmp = EC_POINT_new(group);
   if (tmp == NULL ||
-      // |window_bits_for_scalar_size| assumes we do this step.
+      // Convert the points to affine coordinates. This allows us to use the
+      // slightly faster |ec_point_add_mixed|. The conversion itself is not
+      // cheap, but it is worthwhile when there are two points.
       !EC_POINTs_make_affine(group, total_precomp, precomp_storage, ctx)) {
     goto err;
   }
@@ -302,35 +304,31 @@
       goto err;
     }
 
-    if (g_scalar != NULL) {
-      if (g_wNAF[k] != 0) {
-        if (!lookup_precomp(group, tmp, g_precomp, g_wNAF[k], ctx)) {
+    if (g_scalar != NULL && g_wNAF[k] != 0) {
+      if (!lookup_precomp(group, tmp, g_precomp, g_wNAF[k], ctx)) {
+        goto err;
+      }
+      if (r_is_at_infinity) {
+        if (!EC_POINT_copy(r, tmp)) {
           goto err;
         }
-        if (r_is_at_infinity) {
-          if (!EC_POINT_copy(r, tmp)) {
-            goto err;
-          }
-          r_is_at_infinity = 0;
-        } else if (!EC_POINT_add(group, r, r, tmp, ctx)) {
-          goto err;
-        }
+        r_is_at_infinity = 0;
+      } else if (!ec_point_add_mixed(group, r, r, tmp, ctx)) {
+        goto err;
       }
     }
 
-    if (p_scalar != NULL) {
-      if (p_wNAF[k] != 0) {
-        if (!lookup_precomp(group, tmp, p_precomp, p_wNAF[k], ctx)) {
+    if (p_scalar != NULL && p_wNAF[k] != 0) {
+      if (!lookup_precomp(group, tmp, p_precomp, p_wNAF[k], ctx)) {
+        goto err;
+      }
+      if (r_is_at_infinity) {
+        if (!EC_POINT_copy(r, tmp)) {
           goto err;
         }
-        if (r_is_at_infinity) {
-          if (!EC_POINT_copy(r, tmp)) {
-            goto err;
-          }
-          r_is_at_infinity = 0;
-        } else if (!EC_POINT_add(group, r, r, tmp, ctx)) {
-          goto err;
-        }
+        r_is_at_infinity = 0;
+      } else if (!ec_point_add_mixed(group, r, r, tmp, ctx)) {
+        goto err;
       }
     }
   }