Optimize EC_GFp_nistp256_method's cmp_x_coordinate.

Before:
Did 35496 ECDSA P-256 verify operations in 10027999us (3539.7 ops/sec)
After [+6.9%]:
Did 38170 ECDSA P-256 verify operations in 10090160us (3782.9 ops/sec)

Change-Id: Ib272d19954f46d96efc2b6d5dd480b5b85a34523
Reviewed-on: https://boringssl-review.googlesource.com/c/33067
Commit-Queue: David Benjamin <davidben@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/third_party/fiat/p256.c b/third_party/fiat/p256.c
index c8e42a3..638f681 100644
--- a/third_party/fiat/p256.c
+++ b/third_party/fiat/p256.c
@@ -35,6 +35,7 @@
 #include <openssl/mem.h>
 #include <openssl/type_check.h>
 
+#include <assert.h>
 #include <string.h>
 
 #include "../../crypto/fipsmodule/delocate.h"
@@ -1807,6 +1808,58 @@
   fe_to_generic(&r->Z, ret[2]);
 }
 
+static int ec_GFp_nistp256_cmp_x_coordinate(const EC_GROUP *group,
+                                            const EC_RAW_POINT *p,
+                                            const EC_SCALAR *r) {
+  if (ec_GFp_simple_is_at_infinity(group, p)) {
+    return 0;
+  }
+
+  // We wish to compare X/Z^2 with r. This is equivalent to comparing X with
+  // r*Z^2. Note that X and Z are represented in Montgomery form, while r is
+  // not.
+  fe Z2_mont;
+  fe_from_generic(Z2_mont, &p->Z);
+  fe_mul(Z2_mont, Z2_mont, Z2_mont);
+
+  fe r_Z2;
+  fe_frombytes(r_Z2, r->bytes);  // r < order < p, so this is valid.
+  fe_mul(r_Z2, r_Z2, Z2_mont);
+
+  fe X;
+  fe_from_generic(X, &p->X);
+  fe_from_montgomery(X);
+
+  if (OPENSSL_memcmp(&r_Z2, &X, sizeof(r_Z2)) == 0) {
+    return 1;
+  }
+
+  // During signing the x coefficient is reduced modulo the group order.
+  // Therefore there is a small possibility, less than 1/2^128, that group_order
+  // < p.x < P. in that case we need not only to compare against |r| but also to
+  // compare against r+group_order.
+
+  // P_MINUS_ORDER is the difference between the field order (p) and the group
+  // order (N). This value is not in the Montgomery domain.
+  static const BN_ULONG P_MINUS_ORDER[] = {
+      TOBN(0x0c46353d, 0x039cdaae), TOBN(0x43190553, 0x58e8617b),
+      TOBN(0x00000000, 0x00000000), TOBN(0x00000000, 0x00000000)};
+  assert(OPENSSL_ARRAY_SIZE(P_MINUS_ORDER) == group->order.width);
+  if (bn_less_than_words(r->words, P_MINUS_ORDER,
+                         OPENSSL_ARRAY_SIZE(P_MINUS_ORDER))) {
+    // We can ignore the carry because: r + group_order < p < 2^256.
+    EC_FELEM tmp;
+    bn_add_words(tmp.words, r->words, group->order.d, group->order.width);
+    fe_from_generic(r_Z2, &tmp);
+    fe_mul(r_Z2, r_Z2, Z2_mont);
+    if (OPENSSL_memcmp(&r_Z2, &X, sizeof(r_Z2)) == 0) {
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
 DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistp256_method) {
   out->group_init = ec_GFp_mont_group_init;
   out->group_finish = ec_GFp_mont_group_finish;
@@ -1823,7 +1876,7 @@
   out->felem_to_bignum = ec_GFp_mont_felem_to_bignum;
   out->scalar_inv_montgomery = ec_simple_scalar_inv_montgomery;
   out->scalar_inv_montgomery_vartime = ec_GFp_simple_mont_inv_mod_ord_vartime;
-  out->cmp_x_coordinate = ec_GFp_simple_cmp_x_coordinate;
+  out->cmp_x_coordinate = ec_GFp_nistp256_cmp_x_coordinate;
 };
 
 #undef BORINGSSL_NISTP256_64BIT