Speed up ECDSA verify on x86-64.

This commit improves the performance of ECDSA signature verification
(over the NIST P-256 curve) on x86-64 platforms by a factor of 1.15x.
It does so by:
  1) Leveraging the fact that verification does not need to run in
     constant time. To this end, we implemented:
    a) the function ecp_nistz256_points_mul_public, which follows the
       current ecp_nistz256_points_mul function but removes its
       constant-time features.
    b) the Binary Extended Euclidean Algorithm (BEEU) in x86-64 assembly,
       replacing the modular inverse function currently used for the
       inversion (a word-sized C sketch of the algorithm is included
       below, after this list).
  2) The last step of the ECDSA_verify function compares the affine (x)
     coordinate with the signature's (r) value. Converting x from the
     Jacobian representation to the affine coordinate requires one field
     inversion (x_affine = x * z^(-2)). We save this inversion and speed up
     the computation by instead bringing r into the Jacobian domain
     (r_jacobian = r * z^2), which is faster (a sketch of this comparison
     is also included below).
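
For illustration only, the following is a minimal, word-sized C sketch of
the BEEU idea that the new assembly implements for 256-bit operands. The
function name is hypothetical, it operates on a 64-bit modulus, and it
uses __uint128_t (a GCC/Clang extension) for the headroom that the
assembly gets from a fifth 64-bit limb; it is not the committed
implementation.

    #include <stdint.h>

    // Variable-time a^-1 mod n for odd n > 1, via the binary extended
    // Euclidean algorithm. Loop invariants (as in the assembly comments):
    //     X*a ==  B (mod n)   and   (-1)*Y*a == A (mod n)
    static int beeu_sketch(uint64_t *out, uint64_t a, uint64_t n) {
      __uint128_t X = 1, Y = 0;
      uint64_t B = a, A = n;

      while (B != 0) {
        while ((B & 1) == 0) {  // Strip factors of two from B and halve
          B >>= 1;              // X mod n alongside.
          if (X & 1) {
            X += n;             // Make X even without changing X mod n.
          }
          X >>= 1;
        }
        while ((A & 1) == 0) {  // Same for A and Y.
          A >>= 1;
          if (Y & 1) {
            Y += n;
          }
          Y >>= 1;
        }
        if (B >= A) {           // Subtract the smaller value from the
          B -= A;               // larger one and fix up the coefficient,
          X += Y;               // preserving the invariants above.
        } else {
          A -= B;
          Y += X;
        }
      }

      if (A != 1) {
        return 0;               // a is not invertible modulo n.
      }
      Y %= n;                   // Final reduction (.Lbeeu_reduction_loop).
      *out = (uint64_t)((n - Y) % n);  // Cancel the (-1): a^-1 == n - Y.
      return 1;
    }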
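
Likewise, a hedged sketch of the final comparison: the felem type and the
felem_* helpers below are illustrative stand-ins for the ecp_nistz256_*
primitives (they are not part of the committed API); the point is only
that comparing X with r*z^2 avoids the field inversion that computing
x_affine = X*z^(-2) would require.

    #include <stdint.h>

    typedef uint64_t felem[4];

    // Hypothetical helpers, standing in for the ecp_nistz256_* primitives.
    void felem_sqr_mont(felem res, const felem a);
    void felem_mul_mont(felem res, const felem a, const felem b);
    void felem_from_mont(felem res, const felem a);
    int felem_equal(const felem a, const felem b);

    // Returns one if the Jacobian point (X, Y, Z) has affine x-coordinate
    // equal to r, and zero otherwise. X and Z are in Montgomery form; r is
    // a plain (non-Montgomery) value, as in the committed code.
    static int cmp_x_sketch(const felem X, const felem Z, const felem r) {
      felem z2_mont, r_z2, x_plain;
      felem_sqr_mont(z2_mont, Z);        // z^2, still in Montgomery form.
      felem_mul_mont(r_z2, r, z2_mont);  // The Montgomery factors cancel,
                                         // leaving the plain value r*z^2.
      felem_from_mont(x_plain, X);       // Plain value of X.
      return felem_equal(x_plain, r_z2); // x_affine == r  <=>  X == r*z^2.
    }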

The measured results are:
Before (on a Kaby Lake desktop with gcc-5):
Did 26000 ECDSA P-224 signing operations in 1002372us (25938.5 ops/sec)
Did 11000 ECDSA P-224 verify operations in 1043821us (10538.2 ops/sec)
Did 55000 ECDSA P-256 signing operations in 1017560us (54050.9 ops/sec)
Did 17000 ECDSA P-256 verify operations in 1051280us (16170.8 ops/sec)

After (on a Kaby Lake desktop with gcc-5):
Did 27000 ECDSA P-224 signing operations in 1011287us (26698.7 ops/sec)
Did 11640 ECDSA P-224 verify operations in 1076698us (10810.8 ops/sec)
Did 55000 ECDSA P-256 signing operations in 1016880us (54087.0 ops/sec)
Did 20000 ECDSA P-256 verify operations in 1038736us (19254.2 ops/sec)

Before (on a Skylake server platform with gcc-5):
Did 25000 ECDSA P-224 signing operations in 1021651us (24470.2 ops/sec)
Did 10373 ECDSA P-224 verify operations in 1046563us (9911.5 ops/sec)
Did 50000 ECDSA P-256 signing operations in 1002774us (49861.7 ops/sec)
Did 15000 ECDSA P-256 verify operations in 1006471us (14903.6 ops/sec)

After (on a Skylake server platform with gcc-5):
Did 25000 ECDSA P-224 signing operations in 1020958us (24486.8 ops/sec)
Did 10373 ECDSA P-224 verify operations in 1046359us (9913.4 ops/sec)
Did 50000 ECDSA P-256 signing operations in 1003996us (49801.0 ops/sec)
Did 18000 ECDSA P-256 verify operations in 1021604us (17619.4 ops/sec)

Developers and authors:
***************************************************************************
Nir Drucker (1,2), Shay Gueron (1,2)
(1) Amazon Web Services Inc.
(2) University of Haifa, Israel
***************************************************************************

Change-Id: Idd42a7bc40626bce974ea000b61fdb5bad33851c
Reviewed-on: https://boringssl-review.googlesource.com/c/31304
Commit-Queue: Adam Langley <agl@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
Reviewed-by: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt
index e6c8cc6..463febb 100644
--- a/crypto/fipsmodule/CMakeLists.txt
+++ b/crypto/fipsmodule/CMakeLists.txt
@@ -11,6 +11,7 @@
     ghash-x86_64.${ASM_EXT}
     md5-x86_64.${ASM_EXT}
     p256-x86_64-asm.${ASM_EXT}
+    p256_beeu-x86_64-asm.${ASM_EXT}
     rdrand-x86_64.${ASM_EXT}
     rsaz-avx2.${ASM_EXT}
     sha1-x86_64.${ASM_EXT}
@@ -100,6 +101,7 @@
 perlasm(md5-586.${ASM_EXT} md5/asm/md5-586.pl)
 perlasm(md5-x86_64.${ASM_EXT} md5/asm/md5-x86_64.pl)
 perlasm(p256-x86_64-asm.${ASM_EXT} ec/asm/p256-x86_64-asm.pl)
+perlasm(p256_beeu-x86_64-asm.${ASM_EXT} ec/asm/p256_beeu-x86_64-asm.pl)
 perlasm(rdrand-x86_64.${ASM_EXT} rand/asm/rdrand-x86_64.pl)
 perlasm(rsaz-avx2.${ASM_EXT} bn/asm/rsaz-avx2.pl)
 perlasm(sha1-586.${ASM_EXT} sha/asm/sha1-586.pl)
diff --git a/crypto/fipsmodule/ec/asm/p256_beeu-x86_64-asm.pl b/crypto/fipsmodule/ec/asm/p256_beeu-x86_64-asm.pl
new file mode 100644
index 0000000..12b9f5a
--- /dev/null
+++ b/crypto/fipsmodule/ec/asm/p256_beeu-x86_64-asm.pl
@@ -0,0 +1,405 @@
+# Copyright (c) 2018, Amazon Inc.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+#
+# Written by Nir Drucker, and Shay Gueron
+# AWS Cryptographic Algorithms Group
+# (ndrucker@amazon.com, gueron@amazon.com)
+# based on BN_mod_inverse_odd
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+#############################################################################
+# extern int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS],
+#                                     BN_ULONG a[P256_LIMBS],
+#                                     BN_ULONG n[P256_LIMBS]);
+#
+# (Binary Extended Euclidean Algorithm.
+#  See https://en.wikipedia.org/wiki/Binary_GCD_algorithm)
+#
+# Assumption 1: n is odd for the BEEU
+# Assumption 2: 1 < a < n < 2^256
+
+$out = "%rdi";
+$a = "%rsi";
+$n = "%rdx";
+
+# X/Y will hold the inverse parameter
+# Assumption: X,Y<2^(256)
+$x0 = "%r8";
+$x1 = "%r9";
+$x2 = "%r10";
+$x3 = "%r11";
+# borrow from out (out is needed only at the end)
+$x4 = "%rdi";
+$y0 = "%r12";
+$y1 = "%r13";
+$y2 = "%r14";
+$y3 = "%r15";
+$y4 = "%rbp";
+$shift = "%rcx";
+$t0 = "%rax";
+$t1 = "%rbx";
+$t2 = "%rsi";
+# borrow
+$t3 = "%rcx";
+
+$T0 = "%xmm0";
+$T1 = "%xmm1";
+
+# Offsets on the stack
+$out_rsp = 0;
+$shift_rsp = $out_rsp+0x8;
+$a_rsp0 = $shift_rsp+0x8;
+$a_rsp1 = $a_rsp0+0x8;
+$a_rsp2 = $a_rsp1+0x8;
+$a_rsp3 = $a_rsp2+0x8;
+$b_rsp0 = $a_rsp3+0x8;
+$b_rsp1 = $b_rsp0+0x8;
+$b_rsp2 = $b_rsp1+0x8;
+$b_rsp3 = $b_rsp2+0x8;
+
+# Borrow when a_rsp/b_rsp are no longer needed.
+$y_rsp0 = $a_rsp0;
+$y_rsp1 = $y_rsp0+0x8;
+$y_rsp2 = $y_rsp1+0x8;
+$y_rsp3 = $y_rsp2+0x8;
+$y_rsp4 = $y_rsp3+0x8;
+$last_rsp_offset = $b_rsp3+0x8;
+
+sub TEST_B_ZERO {
+  return <<___;
+    xorq $t1, $t1
+    or $b_rsp0(%rsp), $t1
+    or $b_rsp1(%rsp), $t1
+    or $b_rsp2(%rsp), $t1
+    or $b_rsp3(%rsp), $t1
+    jz .Lbeeu_loop_end
+___
+}
+
+$g_next_label = 0;
+
+sub SHIFT1 {
+  my ($var0, $var1, $var2, $var3, $var4) = @_;
+  my $label = ".Lshift1_${g_next_label}";
+  $g_next_label++;
+
+  return <<___;
+    # If the value is odd, add n to make it even, then divide it by two.
+    movq \$1, $t1
+    andq $var0, $t1
+    jz $label
+    add 0*8($n), $var0
+    adc 1*8($n), $var1
+    adc 2*8($n), $var2
+    adc 3*8($n), $var3
+    adc \$0, $var4
+
+$label:
+    shrdq \$1, $var1, $var0
+    shrdq \$1, $var2, $var1
+    shrdq \$1, $var3, $var2
+    shrdq \$1, $var4, $var3
+    shrq  \$1, $var4
+___
+}
+
+sub SHIFT256 {
+  my ($var) = @_;
+  return <<___;
+    # Copy the shifted values.
+    # Remember not to clobber t3 (= %rcx), which holds the shift count.
+    movq 1*8+$var(%rsp), $t0
+    movq 2*8+$var(%rsp), $t1
+    movq 3*8+$var(%rsp), $t2
+
+    shrdq %cl, $t0, 0*8+$var(%rsp)
+    shrdq %cl, $t1, 1*8+$var(%rsp)
+    shrdq %cl, $t2, 2*8+$var(%rsp)
+
+    shrq  %cl, $t2
+    mov $t2, 3*8+$var(%rsp)
+___
+}
+
+$code.=<<___;
+.text
+
+.type beeu_mod_inverse_vartime,\@function
+.hidden beeu_mod_inverse_vartime
+.globl  beeu_mod_inverse_vartime
+.align 32
+beeu_mod_inverse_vartime:
+.cfi_startproc
+    push %rbp
+.cfi_push rbp
+    movq %rsp, %rbp
+.cfi_def_cfa_register rbp
+
+    push %r12
+.cfi_push r12
+    push %r13
+.cfi_push r13
+    push %r14
+.cfi_push r14
+    push %r15
+.cfi_push r15
+    push %rbx
+.cfi_push rbx
+    push %rsi
+.cfi_push rsi
+
+    sub \$$last_rsp_offset, %rsp
+    movq $out, $out_rsp(%rsp)
+
+    # X=1, Y=0
+    movq \$1, $x0
+    xorq $x1, $x1
+    xorq $x2, $x2
+    xorq $x3, $x3
+    xorq $x4, $x4
+
+    xorq $y0, $y0
+    xorq $y1, $y1
+    xorq $y2, $y2
+    xorq $y3, $y3
+    xorq $y4, $y4
+
+    # Copy a/n into B/A on the stack.
+    vmovdqu 0*8($a), $T0
+    vmovdqu 2*8($a), $T1
+    vmovdqu $T0, $b_rsp0(%rsp)
+    vmovdqu $T1, $b_rsp2(%rsp)
+
+    vmovdqu 0*8($n), $T0
+    vmovdqu 2*8($n), $T1
+    vmovdqu $T0, $a_rsp0(%rsp)
+    vmovdqu $T1, $a_rsp2(%rsp)
+
+.Lbeeu_loop:
+    ${\TEST_B_ZERO}
+
+    # 0 < B < |n|,
+    # 0 < A <= |n|,
+    # (1)      X*a  ==  B   (mod |n|),
+    # (2) (-1)*Y*a  ==  A   (mod |n|)
+
+    # Now divide B by the maximum possible power of two in the
+    # integers, and divide X by the same value mod |n|. When we're
+    # done, (1) still holds.
+    movq \$1, $shift
+
+    # Note that B > 0
+.Lbeeu_shift_loop_XB:
+    movq $shift, $t1
+    andq $b_rsp0(%rsp), $t1
+    jnz .Lbeeu_shift_loop_end_XB
+
+    ${\SHIFT1($x0, $x1, $x2, $x3, $x4)}
+    shl \$1, $shift
+
+    # Test wraparound of the shift parameter. The probability of having 32
+    # zeroes in a row is small, so capping the value below at \$0x8000000
+    # (or even \$0x8000) does not affect the performance. We choose 0x8000000
+    # because it is the maximal immediate value possible.
+    cmp \$0x8000000, $shift
+    jne .Lbeeu_shift_loop_XB
+
+.Lbeeu_shift_loop_end_XB:
+    bsf $shift, $shift
+    test $shift, $shift
+    jz .Lbeeu_no_shift_XB
+
+    ${\SHIFT256($b_rsp0)}
+
+.Lbeeu_no_shift_XB:
+    # Same for A and Y.  Afterwards, (2) still holds.
+    movq \$1, $shift
+
+    # Note that A > 0
+.Lbeeu_shift_loop_YA:
+    movq $shift, $t1
+    andq $a_rsp0(%rsp), $t1
+    jnz .Lbeeu_shift_loop_end_YA
+
+    ${\SHIFT1($y0, $y1, $y2, $y3, $y4)}
+    shl \$1, $shift
+
+    # Test wraparound of the shift parameter. The probability of having 32
+    # zeroes in a row is small, so capping the value below at \$0x8000000
+    # (or even \$0x8000) does not affect the performance. We choose 0x8000000
+    # because it is the maximal immediate value possible.
+    cmp \$0x8000000, $shift
+    jne .Lbeeu_shift_loop_YA
+
+.Lbeeu_shift_loop_end_YA:
+    bsf $shift, $shift
+    test $shift, $shift
+    jz .Lbeeu_no_shift_YA
+
+    ${\SHIFT256($a_rsp0)}
+
+.Lbeeu_no_shift_YA:
+    # T = B-A (A,B < 2^256)
+    mov $b_rsp0(%rsp), $t0
+    mov $b_rsp1(%rsp), $t1
+    mov $b_rsp2(%rsp), $t2
+    mov $b_rsp3(%rsp), $t3
+    sub $a_rsp0(%rsp), $t0
+    sbb $a_rsp1(%rsp), $t1
+    sbb $a_rsp2(%rsp), $t2
+    sbb $a_rsp3(%rsp), $t3  # borrow from shift
+    jnc .Lbeeu_B_bigger_than_A
+
+    # A = A - B
+    mov $a_rsp0(%rsp), $t0
+    mov $a_rsp1(%rsp), $t1
+    mov $a_rsp2(%rsp), $t2
+    mov $a_rsp3(%rsp), $t3
+    sub $b_rsp0(%rsp), $t0
+    sbb $b_rsp1(%rsp), $t1
+    sbb $b_rsp2(%rsp), $t2
+    sbb $b_rsp3(%rsp), $t3
+    mov $t0, $a_rsp0(%rsp)
+    mov $t1, $a_rsp1(%rsp)
+    mov $t2, $a_rsp2(%rsp)
+    mov $t3, $a_rsp3(%rsp)
+
+    # Y = Y + X
+    add $x0, $y0
+    adc $x1, $y1
+    adc $x2, $y2
+    adc $x3, $y3
+    adc $x4, $y4
+    jmp .Lbeeu_loop
+
+.Lbeeu_B_bigger_than_A:
+    # B = T = B - A
+    mov $t0, $b_rsp0(%rsp)
+    mov $t1, $b_rsp1(%rsp)
+    mov $t2, $b_rsp2(%rsp)
+    mov $t3, $b_rsp3(%rsp)
+
+    # X = Y + X
+    add $y0, $x0
+    adc $y1, $x1
+    adc $y2, $x2
+    adc $y3, $x3
+    adc $y4, $x4
+
+    jmp .Lbeeu_loop
+
+.Lbeeu_loop_end:
+    # The Euclidean algorithm loop ends when B == 0, at which point
+    # A == gcd(a, n). Therefore (-1)*Y*a == A (mod |n|), with Y > 0.
+
+    # Verify that A = 1 ==> (-1)*Y*a = A = 1  (mod |n|)
+    mov $a_rsp0(%rsp), $t1
+    sub \$1, $t1
+    or $a_rsp1(%rsp), $t1
+    or $a_rsp2(%rsp), $t1
+    or $a_rsp3(%rsp), $t1
+    # If not, fail.
+    jnz .Lbeeu_err
+
+    # From this point on, we no longer need X,
+    # so we use it as temporary storage.
+    # X = n
+    movq 0*8($n), $x0
+    movq 1*8($n), $x1
+    movq 2*8($n), $x2
+    movq 3*8($n), $x3
+    xorq $x4, $x4
+
+.Lbeeu_reduction_loop:
+    movq $y0, $y_rsp0(%rsp)
+    movq $y1, $y_rsp1(%rsp)
+    movq $y2, $y_rsp2(%rsp)
+    movq $y3, $y_rsp3(%rsp)
+    movq $y4, $y_rsp4(%rsp)
+
+    # While Y >= n: Y = Y - n
+    sub $x0, $y0
+    sbb $x1, $y1
+    sbb $x2, $y2
+    sbb $x3, $y3
+    sbb \$0, $y4
+
+    # Choose old Y or new Y
+    cmovc $y_rsp0(%rsp), $y0
+    cmovc $y_rsp1(%rsp), $y1
+    cmovc $y_rsp2(%rsp), $y2
+    cmovc $y_rsp3(%rsp), $y3
+    jnc .Lbeeu_reduction_loop
+
+    # X = n - Y (n, Y < 2^256); this cancels the (-1) factor.
+    sub $y0, $x0
+    sbb $y1, $x1
+    sbb $y2, $x2
+    sbb $y3, $x3
+
+.Lbeeu_save:
+    # Save the inverse (< 2^256) to out.
+    mov $out_rsp(%rsp), $out
+
+    movq $x0, 0*8($out)
+    movq $x1, 1*8($out)
+    movq $x2, 2*8($out)
+    movq $x3, 3*8($out)
+
+    # Return 1.
+    movq \$1, %rax
+    jmp .Lbeeu_finish
+
+.Lbeeu_err:
+    # Return 0.
+    xorq %rax, %rax
+
+.Lbeeu_finish:
+    add \$$last_rsp_offset, %rsp
+    pop %rsi
+.cfi_pop rsi
+    pop %rbx
+.cfi_pop rbx
+    pop %r15
+.cfi_pop r15
+    pop %r14
+.cfi_pop r14
+    pop %r13
+.cfi_pop r13
+    pop %r12
+.cfi_pop r12
+    pop %rbp
+.cfi_pop rbp
+.cfi_def_cfa rsp, 8
+.cfi_endproc
+    ret
+
+.size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/fipsmodule/ec/ec.c b/crypto/fipsmodule/ec/ec.c
index 908e35e..e9b1e97 100644
--- a/crypto/fipsmodule/ec/ec.c
+++ b/crypto/fipsmodule/ec/ec.c
@@ -911,6 +911,43 @@
   return 1;
 }
 
+int ec_cmp_x_coordinate(const EC_GROUP *group, const EC_POINT *p,
+                        const BIGNUM *r, BN_CTX *ctx) {
+  return group->meth->cmp_x_coordinate(group, p, r, ctx);
+}
+
+int ec_field_element_to_scalar(const EC_GROUP *group, BIGNUM *r) {
+  // We must have p < 2×order, assuming p is not tiny (p >= 17). Thus we can
+  // reduce by performing at most one subtraction.
+  //
+  // Proof: We only work with prime order curves, so the number of points on
+  // the curve is the order. Thus Hasse's theorem gives:
+  //
+  //     |order - (p + 1)| <= 2×sqrt(p)
+  //         p + 1 - order <= 2×sqrt(p)
+  //     p + 1 - 2×sqrt(p) <= order
+  //       p + 1 - 2×(p/4)  < order       (p/4 > sqrt(p) for p >= 17)
+  //         p/2 < p/2 + 1  < order
+  //                     p  < 2×order
+  //
+  // Additionally, one can manually check this property for built-in curves. It
+  // is enforced for legacy custom curves in |EC_GROUP_set_generator|.
+  //
+  // TODO(davidben): Introduce |EC_FIELD_ELEMENT|, make this a function from
+  // |EC_FIELD_ELEMENT| to |EC_SCALAR|, and cut out the |BIGNUM|. Does this need
+  // to be constant-time for signing? |r| is the x-coordinate for kG, which is
+  // public unless k was rerolled because |s| was zero.
+  assert(!BN_is_negative(r));
+  assert(BN_cmp(r, &group->field) < 0);
+  if (BN_cmp(r, &group->order) >= 0 &&
+      !BN_sub(r, r, &group->order)) {
+    return 0;
+  }
+  assert(!BN_is_negative(r));
+  assert(BN_cmp(r, &group->order) < 0);
+  return 1;
+}
+
 void EC_GROUP_set_asn1_flag(EC_GROUP *group, int flag) {}
 
 const EC_METHOD *EC_GROUP_method_of(const EC_GROUP *group) {
diff --git a/crypto/fipsmodule/ec/ec_montgomery.c b/crypto/fipsmodule/ec/ec_montgomery.c
index 9eace95..cd6bbfd 100644
--- a/crypto/fipsmodule/ec/ec_montgomery.c
+++ b/crypto/fipsmodule/ec/ec_montgomery.c
@@ -232,4 +232,6 @@
   out->bignum_to_felem = ec_GFp_mont_bignum_to_felem;
   out->felem_to_bignum = ec_GFp_mont_felem_to_bignum;
   out->scalar_inv_montgomery = ec_simple_scalar_inv_montgomery;
+  out->scalar_inv_montgomery_vartime = ec_GFp_simple_mont_inv_mod_ord_vartime;
+  out->cmp_x_coordinate = ec_GFp_simple_cmp_x_coordinate;
 }
diff --git a/crypto/fipsmodule/ec/internal.h b/crypto/fipsmodule/ec/internal.h
index bb172b2..02cd04a 100644
--- a/crypto/fipsmodule/ec/internal.h
+++ b/crypto/fipsmodule/ec/internal.h
@@ -162,11 +162,21 @@
   int (*felem_to_bignum)(const EC_GROUP *group, BIGNUM *out,
                          const EC_FELEM *in);
 
-  // scalar_inv_mont sets |out| to |in|^-1, where both input and output are in
-  // Montgomery form.
+  // scalar_inv_montgomery sets |out| to |in|^-1, where both input and output
+  // are in Montgomery form.
   void (*scalar_inv_montgomery)(const EC_GROUP *group, EC_SCALAR *out,
                                 const EC_SCALAR *in);
 
+  // scalar_inv_montgomery_vartime performs the same computation as
+  // |scalar_inv_montgomery|. It further assumes that the inputs are public so
+  // there is no concern about leaking their values through timing.
+  int (*scalar_inv_montgomery_vartime)(const EC_GROUP *group, EC_SCALAR *out,
+                                       const EC_SCALAR *in);
+
+  // cmp_x_coordinate compares the x (affine) coordinate of |p|, mod the group
+  // order, with |r|. It returns one if they are equal and zero otherwise.
+  int (*cmp_x_coordinate)(const EC_GROUP *group, const EC_POINT *p,
+                          const BIGNUM *r, BN_CTX *ctx);
 } /* EC_METHOD */;
 
 const EC_METHOD *EC_GFp_mont_method(void);
@@ -272,6 +282,11 @@
 void ec_scalar_inv_montgomery(const EC_GROUP *group, EC_SCALAR *r,
                               const EC_SCALAR *a);
 
+// ec_scalar_inv_montgomery_vartime performs the same actions as
+// |ec_scalar_inv_montgomery|, but in variable time.
+int ec_scalar_inv_montgomery_vartime(const EC_GROUP *group, EC_SCALAR *r,
+                                     const EC_SCALAR *a);
+
 // ec_point_mul_scalar sets |r| to generator * |g_scalar| + |p| *
 // |p_scalar|. Unlike other functions which take |EC_SCALAR|, |g_scalar| and
 // |p_scalar| need not be fully reduced. They need only contain as many bits as
@@ -287,6 +302,16 @@
     const EC_GROUP *group, EC_POINT *r, const EC_SCALAR *g_scalar,
     const EC_POINT *p, const EC_SCALAR *p_scalar, BN_CTX *ctx);
 
+// ec_cmp_x_coordinate compares the x (affine) coordinate of |p| with |r|. It
+// returns one if they are equal and zero otherwise. The |ctx| must have been
+// started by the caller.
+int ec_cmp_x_coordinate(const EC_GROUP *group, const EC_POINT *p,
+                        const BIGNUM *r, BN_CTX *ctx);
+
+// ec_field_element_to_scalar reduces |r| modulo |group->order|. |r| must
+// previously have been reduced modulo |group->field|.
+int ec_field_element_to_scalar(const EC_GROUP *group, BIGNUM *r);
+
 void ec_GFp_simple_mul(const EC_GROUP *group, EC_RAW_POINT *r,
                        const EC_SCALAR *g_scalar, const EC_RAW_POINT *p,
                        const EC_SCALAR *p_scalar);
@@ -332,6 +357,14 @@
 void ec_simple_scalar_inv_montgomery(const EC_GROUP *group, EC_SCALAR *r,
                                      const EC_SCALAR *a);
 
+int ec_GFp_simple_mont_inv_mod_ord_vartime(const EC_GROUP *group, EC_SCALAR *r,
+                                           const EC_SCALAR *a);
+
+// ec_GFp_simple_cmp_x_coordinate compares the x (affine) coordinate of |p|, mod
+// the group order, with |r|. It returns one if they are equal and zero otherwise.
+int ec_GFp_simple_cmp_x_coordinate(const EC_GROUP *group, const EC_POINT *p,
+                                   const BIGNUM *r, BN_CTX *ctx);
+
 // method functions in montgomery.c
 int ec_GFp_mont_group_init(EC_GROUP *);
 int ec_GFp_mont_group_set_curve(EC_GROUP *, const BIGNUM *p, const BIGNUM *a,
diff --git a/crypto/fipsmodule/ec/p224-64.c b/crypto/fipsmodule/ec/p224-64.c
index 606108f..8c7a812 100644
--- a/crypto/fipsmodule/ec/p224-64.c
+++ b/crypto/fipsmodule/ec/p224-64.c
@@ -1107,6 +1107,8 @@
   out->bignum_to_felem = ec_GFp_nistp224_bignum_to_felem;
   out->felem_to_bignum = ec_GFp_nistp224_felem_to_bignum;
   out->scalar_inv_montgomery = ec_simple_scalar_inv_montgomery;
+  out->scalar_inv_montgomery_vartime = ec_GFp_simple_mont_inv_mod_ord_vartime;
+  out->cmp_x_coordinate = ec_GFp_simple_cmp_x_coordinate;
 };
 
 #endif  // BORINGSSL_HAS_UINT128 && !SMALL
diff --git a/crypto/fipsmodule/ec/p256-x86_64.c b/crypto/fipsmodule/ec/p256-x86_64.c
index a4f6515..e630cb7 100644
--- a/crypto/fipsmodule/ec/p256-x86_64.c
+++ b/crypto/fipsmodule/ec/p256-x86_64.c
@@ -44,6 +44,12 @@
     TOBN(0xffffffff, 0xffffffff), TOBN(0x00000000, 0xfffffffe),
 };
 
+// P256_ORDER is the order of the P-256 group, not in Montgomery form.
+static const BN_ULONG P256_ORDER[P256_LIMBS] = {
+    TOBN(0xf3b9cac2, 0xfc632551), TOBN(0xbce6faad, 0xa7179e84),
+    TOBN(0xffffffff, 0xffffffff), TOBN(0xffffffff, 0x00000000),
+};
+
 // Precomputed tables for the default generator
 #include "p256-x86_64-table.h"
 
@@ -289,19 +295,63 @@
   ecp_nistz256_point_add(r, r, &h);
 }
 
+typedef union {
+  P256_POINT p;
+  P256_POINT_AFFINE a;
+} p256_point_union_t;
+
+static unsigned calc_first_wvalue(unsigned *index, const uint8_t p_str[33]) {
+  static const unsigned kWindowSize = 7;
+  static const unsigned kMask = (1 << (7 /* kWindowSize */ + 1)) - 1;
+  *index = kWindowSize;
+
+  unsigned wvalue = (p_str[0] << 1) & kMask;
+  return booth_recode_w7(wvalue);
+}
+
+static unsigned calc_wvalue(unsigned *index, const uint8_t p_str[33]) {
+  static const unsigned kWindowSize = 7;
+  static const unsigned kMask = (1 << (7 /* kWindowSize */ + 1)) - 1;
+
+  const unsigned off = (*index - 1) / 8;
+  unsigned wvalue = p_str[off] | p_str[off + 1] << 8;
+  wvalue = (wvalue >> ((*index - 1) % 8)) & kMask;
+  *index += kWindowSize;
+
+  return booth_recode_w7(wvalue);
+}
+
+static void mul_p_add_and_store(const EC_GROUP *group, EC_RAW_POINT *r,
+                                const EC_SCALAR *g_scalar,
+                                const EC_RAW_POINT *p_,
+                                const EC_SCALAR *p_scalar,
+                                p256_point_union_t *t, p256_point_union_t *p) {
+  const int p_is_infinity = g_scalar == NULL;
+  if (p_scalar != NULL) {
+    P256_POINT *out = &t->p;
+    if (p_is_infinity) {
+      out = &p->p;
+    }
+
+    ecp_nistz256_windowed_mul(group, out, p_, p_scalar);
+    if (!p_is_infinity) {
+      ecp_nistz256_point_add(&p->p, &p->p, out);
+    }
+  }
+
+  assert(group->field.width == P256_LIMBS);
+  OPENSSL_memcpy(r->X.words, p->p.X, P256_LIMBS * sizeof(BN_ULONG));
+  OPENSSL_memcpy(r->Y.words, p->p.Y, P256_LIMBS * sizeof(BN_ULONG));
+  OPENSSL_memcpy(r->Z.words, p->p.Z, P256_LIMBS * sizeof(BN_ULONG));
+}
+
 static void ecp_nistz256_points_mul(const EC_GROUP *group, EC_RAW_POINT *r,
                                     const EC_SCALAR *g_scalar,
                                     const EC_RAW_POINT *p_,
                                     const EC_SCALAR *p_scalar) {
   assert((p_ != NULL) == (p_scalar != NULL));
 
-  static const unsigned kWindowSize = 7;
-  static const unsigned kMask = (1 << (7 /* kWindowSize */ + 1)) - 1;
-
-  alignas(32) union {
-    P256_POINT p;
-    P256_POINT_AFFINE a;
-  } t, p;
+  alignas(32) p256_point_union_t t, p;
 
   if (g_scalar != NULL) {
     uint8_t p_str[33];
@@ -309,10 +359,8 @@
     p_str[32] = 0;
 
     // First window
-    unsigned wvalue = (p_str[0] << 1) & kMask;
-    unsigned index = kWindowSize;
-
-    wvalue = booth_recode_w7(wvalue);
+    unsigned index = 0;
+    unsigned wvalue = calc_first_wvalue(&index, p_str);
 
     const PRECOMP256_ROW *const precomputed_table =
         (const PRECOMP256_ROW *)ecp_nistz256_precomputed;
@@ -328,12 +376,7 @@
     copy_conditional(p.p.Z, ONE, is_not_zero(wvalue >> 1));
 
     for (int i = 1; i < 37; i++) {
-      unsigned off = (index - 1) / 8;
-      wvalue = p_str[off] | p_str[off + 1] << 8;
-      wvalue = (wvalue >> ((index - 1) % 8)) & kMask;
-      index += kWindowSize;
-
-      wvalue = booth_recode_w7(wvalue);
+      wvalue = calc_wvalue(&index, p_str);
 
       ecp_nistz256_select_w7(&t.a, precomputed_table[i], wvalue >> 1);
 
@@ -344,23 +387,60 @@
     }
   }
 
-  const int p_is_infinity = g_scalar == NULL;
-  if (p_scalar != NULL) {
-    P256_POINT *out = &t.p;
-    if (p_is_infinity) {
-      out = &p.p;
-    }
+  mul_p_add_and_store(group, r, g_scalar, p_, p_scalar, &t, &p);
+}
 
-    ecp_nistz256_windowed_mul(group, out, p_, p_scalar);
-    if (!p_is_infinity) {
-      ecp_nistz256_point_add(&p.p, &p.p, out);
-    }
+static void ecp_nistz256_points_mul_public(const EC_GROUP *group,
+                                           EC_RAW_POINT *r,
+                                           const EC_SCALAR *g_scalar,
+                                           const EC_RAW_POINT *p_,
+                                           const EC_SCALAR *p_scalar) {
+  assert(p_ != NULL && p_scalar != NULL && g_scalar != NULL);
+
+  alignas(32) p256_point_union_t t, p;
+  uint8_t p_str[33];
+  OPENSSL_memcpy(p_str, g_scalar->bytes, 32);
+  p_str[32] = 0;
+
+  // First window
+  unsigned index = 0;
+  unsigned wvalue = calc_first_wvalue(&index, p_str);
+
+  const PRECOMP256_ROW *const precomputed_table =
+      (const PRECOMP256_ROW *)ecp_nistz256_precomputed;
+
+  // Convert |p| from affine to Jacobian coordinates. We set Z to zero if |p|
+  // is infinity and |ONE| otherwise. |p| was computed from the table, so it
+  // is infinity iff |wvalue >> 1| is zero.
+  if ((wvalue >> 1) != 0) {
+    OPENSSL_memcpy(&p.a, &precomputed_table[0][(wvalue >> 1) - 1], sizeof(p.a));
+    OPENSSL_memcpy(&p.p.Z, ONE, sizeof(p.p.Z));
+  } else {
+    OPENSSL_memset(&p.a, 0, sizeof(p.a));
+    OPENSSL_memset(p.p.Z, 0, sizeof(p.p.Z));
   }
 
-  assert(group->field.width == P256_LIMBS);
-  OPENSSL_memcpy(r->X.words, p.p.X, P256_LIMBS * sizeof(BN_ULONG));
-  OPENSSL_memcpy(r->Y.words, p.p.Y, P256_LIMBS * sizeof(BN_ULONG));
-  OPENSSL_memcpy(r->Z.words, p.p.Z, P256_LIMBS * sizeof(BN_ULONG));
+  if ((wvalue & 1) == 1) {
+    ecp_nistz256_neg(p.p.Y, p.p.Y);
+  }
+
+  for (int i = 1; i < 37; i++) {
+    wvalue = calc_wvalue(&index, p_str);
+
+    if ((wvalue >> 1) == 0) {
+      continue;
+    }
+
+    OPENSSL_memcpy(&t.a, &precomputed_table[i][(wvalue >> 1) - 1], sizeof(p.a));
+
+    if ((wvalue & 1) == 1) {
+      ecp_nistz256_neg(t.a.Y, t.a.Y);
+    }
+
+    ecp_nistz256_point_add_affine(&p.p, &p.p, &t.a);
+  }
+
+  mul_p_add_and_store(group, r, g_scalar, p_, p_scalar, &t, &p);
 }
 
 static int ecp_nistz256_get_affine(const EC_GROUP *group,
@@ -486,18 +566,82 @@
   }
 }
 
+static int ecp_nistz256_mont_inv_mod_ord_vartime(const EC_GROUP *group,
+                                                 EC_SCALAR *out,
+                                                 const EC_SCALAR *in) {
+  if (!beeu_mod_inverse_vartime(out->words, in->words, P256_ORDER)) {
+    return 0;
+  }
+
+  // The result should be returned in the Montgomery domain.
+  ec_scalar_to_montgomery(group, out, out);
+  return 1;
+}
+
+static int ecp_nistz256_cmp_x_coordinate(const EC_GROUP *group,
+                                         const EC_POINT *p, const BIGNUM *r,
+                                         BN_CTX *ctx) {
+  if (ec_GFp_simple_is_at_infinity(group, &p->raw)) {
+    OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY);
+    return 0;
+  }
+
+  BN_ULONG r_words[P256_LIMBS];
+  if (!bn_copy_words(r_words, P256_LIMBS, r)) {
+    return 0;
+  }
+
+  // We wish to compare X/Z^2 with r. This is equivalent to comparing X with
+  // r*Z^2. Note that X and Z are represented in Montgomery form, while r is
+  // not.
+  BN_ULONG r_Z2[P256_LIMBS], Z2_mont[P256_LIMBS], X[P256_LIMBS];
+  ecp_nistz256_mul_mont(Z2_mont, p->raw.Z.words, p->raw.Z.words);
+  ecp_nistz256_mul_mont(r_Z2, r_words, Z2_mont);
+  ecp_nistz256_from_mont(X, p->raw.X.words);
+
+  if (OPENSSL_memcmp(r_Z2, X, sizeof(r_Z2)) == 0) {
+    return 1;
+  }
+
+  // During signing the x coordinate is reduced modulo the group order.
+  // Therefore there is a small possibility, less than 1/2^128, that
+  // group_order < p.x < P. In that case we must compare p.x not only against
+  // |r| but also against r + group_order.
+
+  // P_MINUS_ORDER is the difference between the field order (p) and the group
+  // order (N). This value is not in the Montgomery domain.
+  static const BN_ULONG P_MINUS_ORDER[P256_LIMBS] = {
+      TOBN(0x0c46353d, 0x039cdaae), TOBN(0x43190553, 0x58e8617b),
+      TOBN(0x00000000, 0x00000000), TOBN(0x00000000, 0x00000000)};
+
+  if (bn_less_than_words(r_words, P_MINUS_ORDER, P256_LIMBS)) {
+    // We can add in-place, ignoring the carry, because: r + group_order < p <
+    // 2^256
+    bn_add_words(r_words, r_words, P256_ORDER, P256_LIMBS);
+    ecp_nistz256_mul_mont(r_Z2, r_words, Z2_mont);
+    if (OPENSSL_memcmp(r_Z2, X, sizeof(r_Z2)) == 0) {
+      return 1;
+    }
+  }
+
+  OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE);
+  return 0;
+}
+
 DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistz256_method) {
   out->group_init = ec_GFp_mont_group_init;
   out->group_finish = ec_GFp_mont_group_finish;
   out->group_set_curve = ec_GFp_mont_group_set_curve;
   out->point_get_affine_coordinates = ecp_nistz256_get_affine;
   out->mul = ecp_nistz256_points_mul;
-  out->mul_public = ecp_nistz256_points_mul;
+  out->mul_public = ecp_nistz256_points_mul_public;
   out->felem_mul = ec_GFp_mont_felem_mul;
   out->felem_sqr = ec_GFp_mont_felem_sqr;
   out->bignum_to_felem = ec_GFp_mont_bignum_to_felem;
   out->felem_to_bignum = ec_GFp_mont_felem_to_bignum;
   out->scalar_inv_montgomery = ecp_nistz256_inv_mod_ord;
+  out->scalar_inv_montgomery_vartime = ecp_nistz256_mont_inv_mod_ord_vartime;
+  out->cmp_x_coordinate = ecp_nistz256_cmp_x_coordinate;
 };
 
 #endif /* !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
diff --git a/crypto/fipsmodule/ec/p256-x86_64.h b/crypto/fipsmodule/ec/p256-x86_64.h
index 21b461c..9de3240 100644
--- a/crypto/fipsmodule/ec/p256-x86_64.h
+++ b/crypto/fipsmodule/ec/p256-x86_64.h
@@ -61,6 +61,16 @@
   ecp_nistz256_mul_mont(res, in, ONE);
 }
 
+// ecp_nistz256_to_mont sets |res| to |in|, converted to the Montgomery domain
+// by multiplying with RR = 2^512 mod P, precomputed for the NIST P-256 curve.
+static inline void ecp_nistz256_to_mont(BN_ULONG res[P256_LIMBS],
+                                        const BN_ULONG in[P256_LIMBS]) {
+  static const BN_ULONG RR[P256_LIMBS] = {
+      TOBN(0x00000000, 0x00000003), TOBN(0xfffffffb, 0xffffffff),
+      TOBN(0xffffffff, 0xfffffffe), TOBN(0x00000004, 0xfffffffd)};
+  ecp_nistz256_mul_mont(res, in, RR);
+}
+
 
 // P-256 scalar operations.
 //
@@ -79,6 +89,12 @@
 void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS],
                                const BN_ULONG a[P256_LIMBS], int rep);
 
+// beeu_mod_inverse_vartime sets out = a^-1 mod p using the binary extended
+// Euclidean algorithm (BEEU). Assumption: 0 < a < p < 2^(256) and p is odd.
+int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS],
+                             const BN_ULONG a[P256_LIMBS],
+                             const BN_ULONG p[P256_LIMBS]);
+
 
 // P-256 point operations.
 //
diff --git a/crypto/fipsmodule/ec/p256-x86_64_test.cc b/crypto/fipsmodule/ec/p256-x86_64_test.cc
index 8ed1dd4..68749e2 100644
--- a/crypto/fipsmodule/ec/p256-x86_64_test.cc
+++ b/crypto/fipsmodule/ec/p256-x86_64_test.cc
@@ -24,8 +24,11 @@
 #include <gtest/gtest.h>
 
 #include <openssl/bn.h>
+#include <openssl/ec.h>
 #include <openssl/mem.h>
+#include <openssl/nid.h>
 
+#include "internal.h"
 #include "../bn/internal.h"
 #include "../../internal.h"
 #include "../../test/file_test.h"
@@ -87,6 +90,59 @@
   }
 }
 
+TEST(P256_X86_64Test, BEEU) {
+  bssl::UniquePtr<EC_GROUP> group(
+      EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1));
+  ASSERT_TRUE(group);
+
+  BN_ULONG order_words[P256_LIMBS];
+  ASSERT_TRUE(
+      bn_copy_words(order_words, P256_LIMBS, EC_GROUP_get0_order(group.get())));
+
+  BN_ULONG in[P256_LIMBS], out[P256_LIMBS];
+  EC_SCALAR in_scalar, out_scalar, result;
+  OPENSSL_memset(in, 0, sizeof(in));
+
+  // Trying to find the inverse of zero should fail.
+  ASSERT_FALSE(beeu_mod_inverse_vartime(out, in, order_words));
+
+  // kOneMont is 1, in Montgomery form.
+  static const BN_ULONG kOneMont[P256_LIMBS] = {
+      TOBN(0xc46353d, 0x039cdaaf), TOBN(0x43190552, 0x58e8617b),
+      0, 0xffffffff,
+  };
+
+  for (BN_ULONG i = 1; i < 2000; i++) {
+    SCOPED_TRACE(i);
+
+    in[0] = i;
+    if (i >= 1000) {
+      in[1] = i << 8;
+      in[2] = i << 32;
+      in[3] = i << 48;
+    } else {
+      in[1] = in[2] = in[3] = 0;
+    }
+
+    EXPECT_TRUE(bn_less_than_words(in, order_words, P256_LIMBS));
+    ASSERT_TRUE(beeu_mod_inverse_vartime(out, in, order_words));
+    EXPECT_TRUE(bn_less_than_words(out, order_words, P256_LIMBS));
+
+    // Calculate out*in and confirm that it equals one, modulo the order.
+    OPENSSL_memcpy(in_scalar.bytes, in, sizeof(in));
+    OPENSSL_memcpy(out_scalar.bytes, out, sizeof(out));
+    ec_scalar_to_montgomery(group.get(), &in_scalar, &in_scalar);
+    ec_scalar_to_montgomery(group.get(), &out_scalar, &out_scalar);
+    ec_scalar_mul_montgomery(group.get(), &result, &in_scalar, &out_scalar);
+
+    EXPECT_EQ(0, OPENSSL_memcmp(kOneMont, &result, sizeof(kOneMont)));
+
+    // Invert the result and expect to get back to the original value.
+    ASSERT_TRUE(beeu_mod_inverse_vartime(out, out, order_words));
+    EXPECT_EQ(0, OPENSSL_memcmp(in, out, sizeof(in)));
+  }
+}
+
 static bool GetFieldElement(FileTest *t, BN_ULONG out[P256_LIMBS],
                             const char *name) {
   std::vector<uint8_t> bytes;
diff --git a/crypto/fipsmodule/ec/scalar.c b/crypto/fipsmodule/ec/scalar.c
index 1bd6b02..88678a9 100644
--- a/crypto/fipsmodule/ec/scalar.c
+++ b/crypto/fipsmodule/ec/scalar.c
@@ -74,3 +74,8 @@
                               const EC_SCALAR *a) {
   group->meth->scalar_inv_montgomery(group, r, a);
 }
+
+int ec_scalar_inv_montgomery_vartime(const EC_GROUP *group, EC_SCALAR *r,
+                                     const EC_SCALAR *a) {
+  return group->meth->scalar_inv_montgomery_vartime(group, r, a);
+}
diff --git a/crypto/fipsmodule/ec/simple.c b/crypto/fipsmodule/ec/simple.c
index 5c63711..854da1e 100644
--- a/crypto/fipsmodule/ec/simple.c
+++ b/crypto/fipsmodule/ec/simple.c
@@ -570,3 +570,55 @@
   // The points are equal.
   return 0;
 }
+
+int ec_GFp_simple_mont_inv_mod_ord_vartime(const EC_GROUP *group,
+                                           EC_SCALAR *out,
+                                           const EC_SCALAR *in) {
+  // This implementation in fact runs in constant time, even though
+  // constant-time behavior is not required by this interface.
+
+  // out = in^-1 in the Montgomery domain. This is
+  // |ec_scalar_to_montgomery| followed by |ec_scalar_inv_montgomery|, but
+  // |ec_scalar_inv_montgomery| followed by |ec_scalar_from_montgomery| is
+  // equivalent and slightly more efficient.
+  ec_scalar_inv_montgomery(group, out, in);
+  ec_scalar_from_montgomery(group, out, out);
+  return 1;
+}
+
+// Compares the x (affine) coordinate of the point p with r.
+// Returns one if they are equal and zero otherwise.
+// Assumption: the caller starts the BN_CTX.
+int ec_GFp_simple_cmp_x_coordinate(const EC_GROUP *group, const EC_POINT *p,
+                                   const BIGNUM *r, BN_CTX *ctx) {
+  int ret = 0;
+  BN_CTX_start(ctx);
+
+  BIGNUM *X = BN_CTX_get(ctx);
+  if (X == NULL) {
+    OPENSSL_PUT_ERROR(ECDSA, ERR_R_BN_LIB);
+    goto out;
+  }
+
+  if (!EC_POINT_get_affine_coordinates_GFp(group, p, X, NULL, ctx)) {
+    OPENSSL_PUT_ERROR(ECDSA, ERR_R_EC_LIB);
+    goto out;
+  }
+
+  if (!ec_field_element_to_scalar(group, X)) {
+    OPENSSL_PUT_ERROR(ECDSA, ERR_R_BN_LIB);
+    goto out;
+  }
+
+  // The signature is correct iff |X| is equal to |r|.
+  if (BN_ucmp(X, r) != 0) {
+    OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE);
+    goto out;
+  }
+
+  ret = 1;
+
+out:
+  BN_CTX_end(ctx);
+  return ret;
+}
diff --git a/crypto/fipsmodule/ecdsa/ecdsa.c b/crypto/fipsmodule/ecdsa/ecdsa.c
index f3ce214..7ec4565 100644
--- a/crypto/fipsmodule/ecdsa/ecdsa.c
+++ b/crypto/fipsmodule/ecdsa/ecdsa.c
@@ -98,40 +98,6 @@
                           order->width);
 }
 
-// field_element_to_scalar reduces |r| modulo |group->order|. |r| must
-// previously have been reduced modulo |group->field|.
-static int field_element_to_scalar(const EC_GROUP *group, BIGNUM *r) {
-  // We must have p < 2×order, assuming p is not tiny (p >= 17). Thus rather we
-  // can reduce by performing at most one subtraction.
-  //
-  // Proof: We only work with prime order curves, so the number of points on
-  // the curve is the order. Thus Hasse's theorem gives:
-  //
-  //     |order - (p + 1)| <= 2×sqrt(p)
-  //         p + 1 - order <= 2×sqrt(p)
-  //     p + 1 - 2×sqrt(p) <= order
-  //       p + 1 - 2×(p/4)  < order       (p/4 > sqrt(p) for p >= 17)
-  //         p/2 < p/2 + 1  < order
-  //                     p  < 2×order
-  //
-  // Additionally, one can manually check this property for built-in curves. It
-  // is enforced for legacy custom curves in |EC_GROUP_set_generator|.
-  //
-  // TODO(davidben): Introduce |EC_FIELD_ELEMENT|, make this a function from
-  // |EC_FIELD_ELEMENT| to |EC_SCALAR|, and cut out the |BIGNUM|. Does this need
-  // to be constant-time for signing? |r| is the x-coordinate for kG, which is
-  // public unless k was rerolled because |s| was zero.
-  assert(!BN_is_negative(r));
-  assert(BN_cmp(r, &group->field) < 0);
-  if (BN_cmp(r, &group->order) >= 0 &&
-      !BN_sub(r, r, &group->order)) {
-    return 0;
-  }
-  assert(!BN_is_negative(r));
-  assert(BN_cmp(r, &group->order) < 0);
-  return 1;
-}
-
 ECDSA_SIG *ECDSA_SIG_new(void) {
   ECDSA_SIG *sig = OPENSSL_malloc(sizeof(ECDSA_SIG));
   if (sig == NULL) {
@@ -193,12 +159,6 @@
   }
   int ret = 0;
   EC_POINT *point = NULL;
-  BN_CTX_start(ctx);
-  BIGNUM *X = BN_CTX_get(ctx);
-  if (X == NULL) {
-    OPENSSL_PUT_ERROR(ECDSA, ERR_R_BN_LIB);
-    goto err;
-  }
 
   EC_SCALAR r, s, u1, u2, s_inv_mont, m;
   if (BN_is_zero(sig->r) ||
@@ -210,11 +170,7 @@
   }
 
   // s_inv_mont = s^-1 in the Montgomery domain. This is
-  // |ec_scalar_to_montgomery| followed by |ec_scalar_inv_montgomery|, but
-  // |ec_scalar_inv_montgomery| followed by |ec_scalar_from_montgomery| is
-  // equivalent and slightly more efficient.
-  ec_scalar_inv_montgomery(group, &s_inv_mont, &s);
-  ec_scalar_from_montgomery(group, &s_inv_mont, &s_inv_mont);
+  ec_scalar_inv_montgomery_vartime(group, &s_inv_mont, &s);
 
   // u1 = m * s^-1 mod order
   // u2 = r * s^-1 mod order
@@ -234,24 +190,10 @@
     OPENSSL_PUT_ERROR(ECDSA, ERR_R_EC_LIB);
     goto err;
   }
-  if (!EC_POINT_get_affine_coordinates_GFp(group, point, X, NULL, ctx)) {
-    OPENSSL_PUT_ERROR(ECDSA, ERR_R_EC_LIB);
-    goto err;
-  }
-  if (!field_element_to_scalar(group, X)) {
-    OPENSSL_PUT_ERROR(ECDSA, ERR_R_BN_LIB);
-    goto err;
-  }
-  // The signature is correct iff |X| is equal to |sig->r|.
-  if (BN_ucmp(X, sig->r) != 0) {
-    OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE);
-    goto err;
-  }
 
-  ret = 1;
+  ret = ec_cmp_x_coordinate(group, point, sig->r, ctx);
 
 err:
-  BN_CTX_end(ctx);
   BN_CTX_free(ctx);
   EC_POINT_free(point);
   return ret;
@@ -320,7 +262,7 @@
       goto err;
     }
 
-    if (!field_element_to_scalar(group, r)) {
+    if (!ec_field_element_to_scalar(group, r)) {
       goto err;
     }
   } while (BN_is_zero(r));
diff --git a/third_party/fiat/p256.c b/third_party/fiat/p256.c
index f42f8fe..e962d56 100644
--- a/third_party/fiat/p256.c
+++ b/third_party/fiat/p256.c
@@ -1807,6 +1807,8 @@
   out->bignum_to_felem = ec_GFp_mont_bignum_to_felem;
   out->felem_to_bignum = ec_GFp_mont_felem_to_bignum;
   out->scalar_inv_montgomery = ec_simple_scalar_inv_montgomery;
+  out->scalar_inv_montgomery_vartime = ec_GFp_simple_mont_inv_mod_ord_vartime;
+  out->cmp_x_coordinate = ec_GFp_simple_cmp_x_coordinate;
 };
 
 #undef BORINGSSL_NISTP256_64BIT