Optimize EC_GFp_mont_method's cmp_x_coordinate.

For simplicity, punt order > field or width mismatches. Analogous
optimizations are possible, but the generic path works fine and no
commonly-used curve looks hits those cases.

Before:
Did 5888 ECDSA P-384 verify operations in 3094535us (1902.7 ops/sec)
After [+6.7%]:
Did 6107 ECDSA P-384 verify operations in 3007515us (2030.6 ops/sec)

Also we can fill in p - order generically and avoid extra copies of some
constants.

Change-Id: I38e1b6d51b28ed4f8cb74697b00a4f0fbc5efc3c
Reviewed-on: https://boringssl-review.googlesource.com/c/33068
Commit-Queue: David Benjamin <davidben@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/ec/ec.c b/crypto/fipsmodule/ec/ec.c
index a783b73..4420c08 100644
--- a/crypto/fipsmodule/ec/ec.c
+++ b/crypto/fipsmodule/ec/ec.c
@@ -344,9 +344,8 @@
       generator->group != group) {
     // |EC_GROUP_set_generator| may only be used with |EC_GROUP|s returned by
     // |EC_GROUP_new_curve_GFp| and may only used once on each group.
-    // Additionally, |generator| must been created from
-    // |EC_GROUP_new_curve_GFp|, not a copy, so that
-    // |generator->group->generator| is set correctly.
+    // |generator| must have been created from |EC_GROUP_new_curve_GFp|, not a
+    // copy, so that |generator->group->generator| is set correctly.
     OPENSSL_PUT_ERROR(EC, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
     return 0;
   }
@@ -367,25 +366,23 @@
   // Note any curve which did not satisfy this must have been invalid or use a
   // tiny prime (less than 17). See the proof in |field_element_to_scalar| in
   // the ECDSA implementation.
+  int ret = 0;
+  EC_POINT *copy = NULL;
   BIGNUM *tmp = BN_new();
   if (tmp == NULL ||
       !BN_lshift1(tmp, order)) {
-    BN_free(tmp);
-    return 0;
+    goto err;
   }
-  int ok = BN_cmp(tmp, &group->field) > 0;
-  BN_free(tmp);
-  if (!ok) {
+  if (BN_cmp(tmp, &group->field) <= 0) {
     OPENSSL_PUT_ERROR(EC, EC_R_INVALID_GROUP_ORDER);
-    return 0;
+    goto err;
   }
 
-  EC_POINT *copy = EC_POINT_new(group);
+  copy = EC_POINT_new(group);
   if (copy == NULL ||
       !EC_POINT_copy(copy, generator) ||
       !BN_copy(&group->order, order)) {
-    EC_POINT_free(copy);
-    return 0;
+    goto err;
   }
   // Store the order in minimal form, so it can be used with |BN_ULONG| arrays.
   bn_set_minimal_width(&group->order);
@@ -393,11 +390,26 @@
   BN_MONT_CTX_free(group->order_mont);
   group->order_mont = BN_MONT_CTX_new_for_modulus(&group->order, NULL);
   if (group->order_mont == NULL) {
-    return 0;
+    goto err;
+  }
+
+  group->field_greater_than_order = BN_cmp(&group->field, &group->order) > 0;
+  if (group->field_greater_than_order) {
+    if (!BN_sub(tmp, &group->field, &group->order) ||
+        !bn_copy_words(group->field_minus_order.words, group->field.width,
+                       tmp)) {
+      goto err;
+    }
   }
 
   ec_group_set0_generator(group, copy);
-  return 1;
+  copy = NULL;
+  ret = 1;
+
+err:
+  EC_POINT_free(copy);
+  BN_free(tmp);
+  return ret;
 }
 
 static EC_GROUP *ec_group_new_from_data(const struct built_in_curve *curve) {
@@ -449,6 +461,14 @@
     goto err;
   }
 
+  group->field_greater_than_order = BN_cmp(&group->field, &group->order) > 0;
+  if (group->field_greater_than_order) {
+    if (!BN_sub(p, &group->field, &group->order) ||
+        !bn_copy_words(group->field_minus_order.words, group->field.width, p)) {
+      goto err;
+    }
+  }
+
   group->order_mont = BN_MONT_CTX_new_for_modulus(&group->order, ctx);
   if (group->order_mont == NULL) {
     OPENSSL_PUT_ERROR(EC, ERR_R_BN_LIB);
diff --git a/crypto/fipsmodule/ec/ec_montgomery.c b/crypto/fipsmodule/ec/ec_montgomery.c
index 4961a7c..caa1966 100644
--- a/crypto/fipsmodule/ec/ec_montgomery.c
+++ b/crypto/fipsmodule/ec/ec_montgomery.c
@@ -417,6 +417,51 @@
   }
 }
 
+static int ec_GFp_mont_cmp_x_coordinate(const EC_GROUP *group,
+                                        const EC_RAW_POINT *p,
+                                        const EC_SCALAR *r) {
+  if (!group->field_greater_than_order ||
+      group->field.width != group->order.width) {
+    // Do not bother optimizing this case. p > order in all commonly-used
+    // curves.
+    return ec_GFp_simple_cmp_x_coordinate(group, p, r);
+  }
+
+  if (ec_GFp_simple_is_at_infinity(group, p)) {
+    return 0;
+  }
+
+  // We wish to compare X/Z^2 with r. This is equivalent to comparing X with
+  // r*Z^2. Note that X and Z are represented in Montgomery form, while r is
+  // not.
+  EC_FELEM r_Z2, Z2_mont, X;
+  ec_GFp_mont_felem_mul(group, &Z2_mont, &p->Z, &p->Z);
+  // r < order < p, so this is valid.
+  OPENSSL_memcpy(r_Z2.words, r->words, group->field.width * sizeof(BN_ULONG));
+  ec_GFp_mont_felem_mul(group, &r_Z2, &r_Z2, &Z2_mont);
+  ec_GFp_mont_felem_from_montgomery(group, &X, &p->X);
+
+  if (ec_felem_equal(group, &r_Z2, &X)) {
+    return 1;
+  }
+
+  // During signing the x coefficient is reduced modulo the group order.
+  // Therefore there is a small possibility, less than 1/2^128, that group_order
+  // < p.x < P. in that case we need not only to compare against |r| but also to
+  // compare against r+group_order.
+  if (bn_less_than_words(r->words, group->field_minus_order.words,
+                         group->field.width)) {
+    // We can ignore the carry because: r + group_order < p < 2^256.
+    bn_add_words(r_Z2.words, r->words, group->order.d, group->field.width);
+    ec_GFp_mont_felem_mul(group, &r_Z2, &r_Z2, &Z2_mont);
+    if (ec_felem_equal(group, &r_Z2, &X)) {
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
 DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_mont_method) {
   out->group_init = ec_GFp_mont_group_init;
   out->group_finish = ec_GFp_mont_group_finish;
@@ -432,5 +477,5 @@
   out->felem_to_bignum = ec_GFp_mont_felem_to_bignum;
   out->scalar_inv_montgomery = ec_simple_scalar_inv_montgomery;
   out->scalar_inv_montgomery_vartime = ec_GFp_simple_mont_inv_mod_ord_vartime;
-  out->cmp_x_coordinate = ec_GFp_simple_cmp_x_coordinate;
+  out->cmp_x_coordinate = ec_GFp_mont_cmp_x_coordinate;
 }
diff --git a/crypto/fipsmodule/ec/internal.h b/crypto/fipsmodule/ec/internal.h
index 89e945c..be548a3 100644
--- a/crypto/fipsmodule/ec/internal.h
+++ b/crypto/fipsmodule/ec/internal.h
@@ -213,7 +213,20 @@
 
   EC_FELEM a, b;  // Curve coefficients.
 
-  int a_is_minus3;  // enable optimized point arithmetics for special case
+  // a_is_minus3 is one if |a| is -3 mod |field| and zero otherwise. Point
+  // arithmetic is optimized for -3.
+  int a_is_minus3;
+
+  // field_greater_than_order is one if |field| is greate than |order| and zero
+  // otherwise.
+  int field_greater_than_order;
+
+  // field_minus_order, if |field_greater_than_order| is true, is |field| minus
+  // |order| represented as an |EC_FELEM|. Otherwise, it is zero.
+  //
+  // Note: unlike |EC_FELEM|s used as intermediate values internal to the
+  // |EC_METHOD|, this value is not encoded in Montgomery form.
+  EC_FELEM field_minus_order;
 
   CRYPTO_refcount_t references;
 
diff --git a/crypto/fipsmodule/ec/p256-x86_64.c b/crypto/fipsmodule/ec/p256-x86_64.c
index 3053e2d..c5f2d1c 100644
--- a/crypto/fipsmodule/ec/p256-x86_64.c
+++ b/crypto/fipsmodule/ec/p256-x86_64.c
@@ -44,12 +44,6 @@
     TOBN(0xffffffff, 0xffffffff), TOBN(0x00000000, 0xfffffffe),
 };
 
-// P256_ORDER is the order of the P-256 group, not in Montgomery form.
-static const BN_ULONG P256_ORDER[P256_LIMBS] = {
-    TOBN(0xf3b9cac2, 0xfc632551), TOBN(0xbce6faad, 0xa7179e84),
-    TOBN(0xffffffff, 0xffffffff), TOBN(0xffffffff, 0x00000000),
-};
-
 // Precomputed tables for the default generator
 #include "p256-x86_64-table.h"
 
@@ -591,7 +585,8 @@
     return ec_GFp_simple_mont_inv_mod_ord_vartime(group, out, in);
   }
 
-  if (!beeu_mod_inverse_vartime(out->words, in->words, P256_ORDER)) {
+  assert(group->order.width == P256_LIMBS);
+  if (!beeu_mod_inverse_vartime(out->words, in->words, group->order.d)) {
     return 0;
   }
 
@@ -607,6 +602,9 @@
     return 0;
   }
 
+  assert(group->order.width == P256_LIMBS);
+  assert(group->field.width == P256_LIMBS);
+
   // We wish to compare X/Z^2 with r. This is equivalent to comparing X with
   // r*Z^2. Note that X and Z are represented in Montgomery form, while r is
   // not.
@@ -623,16 +621,10 @@
   // Therefore there is a small possibility, less than 1/2^128, that group_order
   // < p.x < P. in that case we need not only to compare against |r| but also to
   // compare against r+group_order.
-
-  // P_MINUS_ORDER is the difference between the field order (p) and the group
-  // order (N). This value is not in the Montgomery domain.
-  static const BN_ULONG P_MINUS_ORDER[P256_LIMBS] = {
-      TOBN(0x0c46353d, 0x039cdaae), TOBN(0x43190553, 0x58e8617b),
-      TOBN(0x00000000, 0x00000000), TOBN(0x00000000, 0x00000000)};
-
-  if (bn_less_than_words(r->words, P_MINUS_ORDER, P256_LIMBS)) {
+  if (bn_less_than_words(r->words, group->field_minus_order.words,
+                         P256_LIMBS)) {
     // We can ignore the carry because: r + group_order < p < 2^256.
-    bn_add_words(r_Z2, r->words, P256_ORDER, P256_LIMBS);
+    bn_add_words(r_Z2, r->words, group->order.d, P256_LIMBS);
     ecp_nistz256_mul_mont(r_Z2, r_Z2, Z2_mont);
     if (OPENSSL_memcmp(r_Z2, X, sizeof(r_Z2)) == 0) {
       return 1;
diff --git a/third_party/fiat/p256.c b/third_party/fiat/p256.c
index 638f681..893c9d4 100644
--- a/third_party/fiat/p256.c
+++ b/third_party/fiat/p256.c
@@ -1838,15 +1838,9 @@
   // Therefore there is a small possibility, less than 1/2^128, that group_order
   // < p.x < P. in that case we need not only to compare against |r| but also to
   // compare against r+group_order.
-
-  // P_MINUS_ORDER is the difference between the field order (p) and the group
-  // order (N). This value is not in the Montgomery domain.
-  static const BN_ULONG P_MINUS_ORDER[] = {
-      TOBN(0x0c46353d, 0x039cdaae), TOBN(0x43190553, 0x58e8617b),
-      TOBN(0x00000000, 0x00000000), TOBN(0x00000000, 0x00000000)};
-  assert(OPENSSL_ARRAY_SIZE(P_MINUS_ORDER) == group->order.width);
-  if (bn_less_than_words(r->words, P_MINUS_ORDER,
-                         OPENSSL_ARRAY_SIZE(P_MINUS_ORDER))) {
+  assert(group->field.width == group->order.width);
+  if (bn_less_than_words(r->words, group->field_minus_order.words,
+                         group->field.width)) {
     // We can ignore the carry because: r + group_order < p < 2^256.
     EC_FELEM tmp;
     bn_add_words(tmp.words, r->words, group->order.d, group->order.width);