Revert "Revert "Speed up ECDSA verify on x86-64.""

This reverts commit e907ed4c4b92cd7cea250240e32695c73dacf7eb. CPUID
checks have been added so hopefully this time sticks.

Change-Id: I5e0e5b87427c1230132681f936b3c70bac8263b8
Reviewed-on: https://boringssl-review.googlesource.com/c/32924
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: David Benjamin <davidben@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt
index e6c8cc6..463febb 100644
--- a/crypto/fipsmodule/CMakeLists.txt
+++ b/crypto/fipsmodule/CMakeLists.txt
@@ -11,6 +11,7 @@
     ghash-x86_64.${ASM_EXT}
     md5-x86_64.${ASM_EXT}
     p256-x86_64-asm.${ASM_EXT}
+    p256_beeu-x86_64-asm.${ASM_EXT}
     rdrand-x86_64.${ASM_EXT}
     rsaz-avx2.${ASM_EXT}
     sha1-x86_64.${ASM_EXT}
@@ -100,6 +101,7 @@
 perlasm(md5-586.${ASM_EXT} md5/asm/md5-586.pl)
 perlasm(md5-x86_64.${ASM_EXT} md5/asm/md5-x86_64.pl)
 perlasm(p256-x86_64-asm.${ASM_EXT} ec/asm/p256-x86_64-asm.pl)
+perlasm(p256_beeu-x86_64-asm.${ASM_EXT} ec/asm/p256_beeu-x86_64-asm.pl)
 perlasm(rdrand-x86_64.${ASM_EXT} rand/asm/rdrand-x86_64.pl)
 perlasm(rsaz-avx2.${ASM_EXT} bn/asm/rsaz-avx2.pl)
 perlasm(sha1-586.${ASM_EXT} sha/asm/sha1-586.pl)
diff --git a/crypto/fipsmodule/ec/asm/p256_beeu-x86_64-asm.pl b/crypto/fipsmodule/ec/asm/p256_beeu-x86_64-asm.pl
new file mode 100644
index 0000000..12b9f5a
--- /dev/null
+++ b/crypto/fipsmodule/ec/asm/p256_beeu-x86_64-asm.pl
@@ -0,0 +1,405 @@
+# Copyright (c) 2018, Amazon Inc.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+#
+# Written by Nir Drucker, and Shay Gueron
+# AWS Cryptographic Algorithms Group
+# (ndrucker@amazon.com, gueron@amazon.com)
+# based on BN_mod_inverse_odd
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+#############################################################################
+# extern int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS],
+#                                     BN_ULONG a[P256_LIMBS],
+#                                     BN_ULONG n[P256_LIMBS]);
+#
+# (Binary Extended Euclidean Algorithm.
+#  See https://en.wikipedia.org/wiki/Binary_GCD_algorithm)
+#
+# Assumption 1: n is odd for the BEEU
+# Assumption 2: 1 < a < n < 2^256
+
+$out = "%rdi";
+$a = "%rsi";
+$n = "%rdx";
+
+# X/Y will hold the inverse parameter
+# Assumption: X,Y<2^(256)
+$x0 = "%r8";
+$x1 = "%r9";
+$x2 = "%r10";
+$x3 = "%r11";
+# borrow from out (out is needed only at the end)
+$x4 = "%rdi";
+$y0 = "%r12";
+$y1 = "%r13";
+$y2 = "%r14";
+$y3 = "%r15";
+$y4 = "%rbp";
+$shift = "%rcx";
+$t0 = "%rax";
+$t1 = "%rbx";
+$t2 = "%rsi";
+# borrow
+$t3 = "%rcx";
+
+$T0 = "%xmm0";
+$T1 = "%xmm1";
+
+# Offsets on the stack
+$out_rsp = 0;
+$shift_rsp = $out_rsp+0x8;
+$a_rsp0 = $shift_rsp+0x8;
+$a_rsp1 = $a_rsp0+0x8;
+$a_rsp2 = $a_rsp1+0x8;
+$a_rsp3 = $a_rsp2+0x8;
+$b_rsp0 = $a_rsp3+0x8;
+$b_rsp1 = $b_rsp0+0x8;
+$b_rsp2 = $b_rsp1+0x8;
+$b_rsp3 = $b_rsp2+0x8;
+
+# Borrow when a_rsp/b_rsp are no longer needed.
+$y_rsp0 = $a_rsp0;
+$y_rsp1 = $y_rsp0+0x8;
+$y_rsp2 = $y_rsp1+0x8;
+$y_rsp3 = $y_rsp2+0x8;
+$y_rsp4 = $y_rsp3+0x8;
+$last_rsp_offset = $b_rsp3+0x8;
+
+sub TEST_B_ZERO {
+  return <<___;
+    xorq $t1, $t1
+    or $b_rsp0(%rsp), $t1
+    or $b_rsp1(%rsp), $t1
+    or $b_rsp2(%rsp), $t1
+    or $b_rsp3(%rsp), $t1
+    jz .Lbeeu_loop_end
+___
+}
+
+$g_next_label = 0;
+
+sub SHIFT1 {
+  my ($var0, $var1, $var2, $var3, $var4) = @_;
+  my $label = ".Lshift1_${g_next_label}";
+  $g_next_label++;
+
+  return <<___;
+    # Ensure X is even and divide by two.
+    movq \$1, $t1
+    andq $var0, $t1
+    jz $label
+    add 0*8($n), $var0
+    adc 1*8($n), $var1
+    adc 2*8($n), $var2
+    adc 3*8($n), $var3
+    adc \$0, $var4
+
+$label:
+    shrdq \$1, $var1, $var0
+    shrdq \$1, $var2, $var1
+    shrdq \$1, $var3, $var2
+    shrdq \$1, $var4, $var3
+    shrq  \$1, $var4
+___
+}
+
+sub SHIFT256 {
+  my ($var) = @_;
+  return <<___;
+    # Copy shifted values.
+    # Remember not to override t3=rcx
+    movq 1*8+$var(%rsp), $t0
+    movq 2*8+$var(%rsp), $t1
+    movq 3*8+$var(%rsp), $t2
+
+    shrdq %cl, $t0, 0*8+$var(%rsp)
+    shrdq %cl, $t1, 1*8+$var(%rsp)
+    shrdq %cl, $t2, 2*8+$var(%rsp)
+
+    shrq  %cl, $t2
+    mov $t2, 3*8+$var(%rsp)
+___
+}
+
+$code.=<<___;
+.text
+
+.type beeu_mod_inverse_vartime,\@function
+.hidden beeu_mod_inverse_vartime
+.globl  beeu_mod_inverse_vartime
+.align 32
+beeu_mod_inverse_vartime:
+.cfi_startproc
+    push %rbp
+.cfi_push rbp
+    movq %rsp, %rbp
+.cfi_def_cfa_register rbp
+
+    push %r12
+.cfi_push r12
+    push %r13
+.cfi_push r13
+    push %r14
+.cfi_push r14
+    push %r15
+.cfi_push r15
+    push %rbx
+.cfi_push rbx
+    push %rsi
+.cfi_push rsi
+
+    sub \$$last_rsp_offset, %rsp
+    movq $out, $out_rsp(%rsp)
+
+    # X=1, Y=0
+    movq \$1, $x0
+    xorq $x1, $x1
+    xorq $x2, $x2
+    xorq $x3, $x3
+    xorq $x4, $x4
+
+    xorq $y0, $y0
+    xorq $y1, $y1
+    xorq $y2, $y2
+    xorq $y3, $y3
+    xorq $y4, $y4
+
+    # Copy a/n into B/A on the stack.
+    vmovdqu 0*8($a), $T0
+    vmovdqu 2*8($a), $T1
+    vmovdqu $T0, $b_rsp0(%rsp)
+    vmovdqu $T1, $b_rsp2(%rsp)
+
+    vmovdqu 0*8($n), $T0
+    vmovdqu 2*8($n), $T1
+    vmovdqu $T0, $a_rsp0(%rsp)
+    vmovdqu $T1, $a_rsp2(%rsp)
+
+.Lbeeu_loop:
+    ${\TEST_B_ZERO}
+
+    # 0 < B < |n|,
+    # 0 < A <= |n|,
+    # (1)      X*a  ==  B   (mod |n|),
+    # (2) (-1)*Y*a  ==  A   (mod |n|)
+
+    # Now divide B by the maximum possible power of two in the
+    # integers, and divide X by the same value mod |n|. When we're
+    # done, (1) still holds.
+    movq \$1, $shift
+
+    # Note that B > 0
+.Lbeeu_shift_loop_XB:
+    movq $shift, $t1
+    andq $b_rsp0(%rsp), $t1
+    jnz .Lbeeu_shift_loop_end_XB
+
+    ${\SHIFT1($x0, $x1, $x2, $x3, $x4)}
+    shl \$1, $shift
+
+    # Test wraparound of the shift parameter. The probability to have 32 zeroes
+    # in a row is small Therefore having the value below equal \$0x8000000 or
+    # \$0x8000 does not affect the performance. We choose 0x8000000 because it
+    # is the maximal immediate value possible.
+    cmp \$0x8000000, $shift
+    jne .Lbeeu_shift_loop_XB
+
+.Lbeeu_shift_loop_end_XB:
+    bsf $shift, $shift
+    test $shift, $shift
+    jz .Lbeeu_no_shift_XB
+
+    ${\SHIFT256($b_rsp0)}
+
+.Lbeeu_no_shift_XB:
+    # Same for A and Y.  Afterwards, (2) still holds.
+    movq \$1, $shift
+
+    # Note that A > 0
+.Lbeeu_shift_loop_YA:
+    movq $shift, $t1
+    andq $a_rsp0(%rsp), $t1
+    jnz .Lbeeu_shift_loop_end_YA
+
+    ${\SHIFT1($y0, $y1, $y2, $y3, $y4)}
+    shl \$1, $shift
+
+    # Test wraparound of the shift parameter. The probability to have 32 zeroes
+    # in a row is small therefore having the value below equal \$0x8000000 or
+    # \$0x8000 Does not affect the performance. We choose 0x8000000 because it
+    # is the maximal immediate value possible.
+    cmp \$0x8000000, $shift
+    jne .Lbeeu_shift_loop_YA
+
+.Lbeeu_shift_loop_end_YA:
+    bsf $shift, $shift
+    test $shift, $shift
+    jz .Lbeeu_no_shift_YA
+
+    ${\SHIFT256($a_rsp0)}
+
+.Lbeeu_no_shift_YA:
+    # T = B-A (A,B < 2^256)
+    mov $b_rsp0(%rsp), $t0
+    mov $b_rsp1(%rsp), $t1
+    mov $b_rsp2(%rsp), $t2
+    mov $b_rsp3(%rsp), $t3
+    sub $a_rsp0(%rsp), $t0
+    sbb $a_rsp1(%rsp), $t1
+    sbb $a_rsp2(%rsp), $t2
+    sbb $a_rsp3(%rsp), $t3  # borrow from shift
+    jnc .Lbeeu_B_bigger_than_A
+
+    # A = A - B
+    mov $a_rsp0(%rsp), $t0
+    mov $a_rsp1(%rsp), $t1
+    mov $a_rsp2(%rsp), $t2
+    mov $a_rsp3(%rsp), $t3
+    sub $b_rsp0(%rsp), $t0
+    sbb $b_rsp1(%rsp), $t1
+    sbb $b_rsp2(%rsp), $t2
+    sbb $b_rsp3(%rsp), $t3
+    mov $t0, $a_rsp0(%rsp)
+    mov $t1, $a_rsp1(%rsp)
+    mov $t2, $a_rsp2(%rsp)
+    mov $t3, $a_rsp3(%rsp)
+
+    # Y = Y + X
+    add $x0, $y0
+    adc $x1, $y1
+    adc $x2, $y2
+    adc $x3, $y3
+    adc $x4, $y4
+    jmp .Lbeeu_loop
+
+.Lbeeu_B_bigger_than_A:
+    # B = T = B - A
+    mov $t0, $b_rsp0(%rsp)
+    mov $t1, $b_rsp1(%rsp)
+    mov $t2, $b_rsp2(%rsp)
+    mov $t3, $b_rsp3(%rsp)
+
+    # X = Y + X
+    add $y0, $x0
+    adc $y1, $x1
+    adc $y2, $x2
+    adc $y3, $x3
+    adc $y4, $x4
+
+    jmp .Lbeeu_loop
+
+.Lbeeu_loop_end:
+    # The Euclid's algorithm loop ends when A == beeu(a,n);
+    # Therefore (-1)*Y*a == A (mod |n|), Y>0
+
+    # Verify that A = 1 ==> (-1)*Y*a = A = 1  (mod |n|)
+    mov $a_rsp0(%rsp), $t1
+    sub \$1, $t1
+    or $a_rsp1(%rsp), $t1
+    or $a_rsp2(%rsp), $t1
+    or $a_rsp3(%rsp), $t1
+    # If not, fail.
+    jnz .Lbeeu_err
+
+    # From this point on, we no longer need X
+    # Therefore we use it as a temporary storage.
+    # X = n
+    movq 0*8($n), $x0
+    movq 1*8($n), $x1
+    movq 2*8($n), $x2
+    movq 3*8($n), $x3
+    xorq $x4, $x4
+
+.Lbeeu_reduction_loop:
+    movq $y0, $y_rsp0(%rsp)
+    movq $y1, $y_rsp1(%rsp)
+    movq $y2, $y_rsp2(%rsp)
+    movq $y3, $y_rsp3(%rsp)
+    movq $y4, $y_rsp4(%rsp)
+
+    # If Y>n ==> Y=Y-n
+    sub $x0, $y0
+    sbb $x1, $y1
+    sbb $x2, $y2
+    sbb $x3, $y3
+    sbb \$0, $y4
+
+    # Choose old Y or new Y
+    cmovc $y_rsp0(%rsp), $y0
+    cmovc $y_rsp1(%rsp), $y1
+    cmovc $y_rsp2(%rsp), $y2
+    cmovc $y_rsp3(%rsp), $y3
+    jnc .Lbeeu_reduction_loop
+
+    # X = n - Y (n, Y < 2^256), (Cancel the (-1))
+    sub $y0, $x0
+    sbb $y1, $x1
+    sbb $y2, $x2
+    sbb $y3, $x3
+
+.Lbeeu_save:
+    # Save the inverse(<2^256) to out.
+    mov $out_rsp(%rsp), $out
+
+    movq $x0, 0*8($out)
+    movq $x1, 1*8($out)
+    movq $x2, 2*8($out)
+    movq $x3, 3*8($out)
+
+    # Return 1.
+    movq \$1, %rax
+    jmp .Lbeeu_finish
+
+.Lbeeu_err:
+    # Return 0.
+    xorq %rax, %rax
+
+.Lbeeu_finish:
+    add \$$last_rsp_offset, %rsp
+    pop %rsi
+.cfi_pop rsi
+    pop %rbx
+.cfi_pop rbx
+    pop %r15
+.cfi_pop r15
+    pop %r14
+.cfi_pop r14
+    pop %r13
+.cfi_pop r13
+    pop %r12
+.cfi_pop r12
+    pop %rbp
+.cfi_pop rbp
+.cfi_def_cfa rsp, 8
+.cfi_endproc
+    ret
+
+.size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/fipsmodule/ec/ec.c b/crypto/fipsmodule/ec/ec.c
index 134d051..4c557a1 100644
--- a/crypto/fipsmodule/ec/ec.c
+++ b/crypto/fipsmodule/ec/ec.c
@@ -911,6 +911,43 @@
   return 1;
 }
 
+int ec_cmp_x_coordinate(const EC_GROUP *group, const EC_POINT *p,
+                        const BIGNUM *r, BN_CTX *ctx) {
+  return group->meth->cmp_x_coordinate(group, p, r, ctx);
+}
+
+int ec_field_element_to_scalar(const EC_GROUP *group, BIGNUM *r) {
+  // We must have p < 2×order, assuming p is not tiny (p >= 17). Thus rather we
+  // can reduce by performing at most one subtraction.
+  //
+  // Proof: We only work with prime order curves, so the number of points on
+  // the curve is the order. Thus Hasse's theorem gives:
+  //
+  //     |order - (p + 1)| <= 2×sqrt(p)
+  //         p + 1 - order <= 2×sqrt(p)
+  //     p + 1 - 2×sqrt(p) <= order
+  //       p + 1 - 2×(p/4)  < order       (p/4 > sqrt(p) for p >= 17)
+  //         p/2 < p/2 + 1  < order
+  //                     p  < 2×order
+  //
+  // Additionally, one can manually check this property for built-in curves. It
+  // is enforced for legacy custom curves in |EC_GROUP_set_generator|.
+  //
+  // TODO(davidben): Introduce |EC_FIELD_ELEMENT|, make this a function from
+  // |EC_FIELD_ELEMENT| to |EC_SCALAR|, and cut out the |BIGNUM|. Does this need
+  // to be constant-time for signing? |r| is the x-coordinate for kG, which is
+  // public unless k was rerolled because |s| was zero.
+  assert(!BN_is_negative(r));
+  assert(BN_cmp(r, &group->field) < 0);
+  if (BN_cmp(r, &group->order) >= 0 &&
+      !BN_sub(r, r, &group->order)) {
+    return 0;
+  }
+  assert(!BN_is_negative(r));
+  assert(BN_cmp(r, &group->order) < 0);
+  return 1;
+}
+
 void EC_GROUP_set_asn1_flag(EC_GROUP *group, int flag) {}
 
 const EC_METHOD *EC_GROUP_method_of(const EC_GROUP *group) {
diff --git a/crypto/fipsmodule/ec/ec_montgomery.c b/crypto/fipsmodule/ec/ec_montgomery.c
index 8f277b0..90bd8ce 100644
--- a/crypto/fipsmodule/ec/ec_montgomery.c
+++ b/crypto/fipsmodule/ec/ec_montgomery.c
@@ -439,4 +439,6 @@
   out->bignum_to_felem = ec_GFp_mont_bignum_to_felem;
   out->felem_to_bignum = ec_GFp_mont_felem_to_bignum;
   out->scalar_inv_montgomery = ec_simple_scalar_inv_montgomery;
+  out->scalar_inv_montgomery_vartime = ec_GFp_simple_mont_inv_mod_ord_vartime;
+  out->cmp_x_coordinate = ec_GFp_simple_cmp_x_coordinate;
 }
diff --git a/crypto/fipsmodule/ec/internal.h b/crypto/fipsmodule/ec/internal.h
index e62ceb8..0588fb5 100644
--- a/crypto/fipsmodule/ec/internal.h
+++ b/crypto/fipsmodule/ec/internal.h
@@ -167,11 +167,21 @@
   int (*felem_to_bignum)(const EC_GROUP *group, BIGNUM *out,
                          const EC_FELEM *in);
 
-  // scalar_inv_mont sets |out| to |in|^-1, where both input and output are in
-  // Montgomery form.
+  // scalar_inv_montgomery sets |out| to |in|^-1, where both input and output
+  // are in Montgomery form.
   void (*scalar_inv_montgomery)(const EC_GROUP *group, EC_SCALAR *out,
                                 const EC_SCALAR *in);
 
+  // scalar_inv_montgomery_vartime performs the same computation as
+  // |scalar_inv_montgomery|. It further assumes that the inputs are public so
+  // there is no concern about leaking their values through timing.
+  int (*scalar_inv_montgomery_vartime)(const EC_GROUP *group, EC_SCALAR *out,
+                                       const EC_SCALAR *in);
+
+  // cmp_x_coordinate compares the x (affine) coordinate of |p|, mod the group
+  // order, with |r|. It returns one if they are equal and zero otherwise.
+  int (*cmp_x_coordinate)(const EC_GROUP *group, const EC_POINT *p,
+                          const BIGNUM *r, BN_CTX *ctx);
 } /* EC_METHOD */;
 
 const EC_METHOD *EC_GFp_mont_method(void);
@@ -277,6 +287,11 @@
 void ec_scalar_inv_montgomery(const EC_GROUP *group, EC_SCALAR *r,
                               const EC_SCALAR *a);
 
+// ec_scalar_inv_montgomery_vartime performs the same actions as
+// |ec_scalar_inv_montgomery|, but in variable time.
+int ec_scalar_inv_montgomery_vartime(const EC_GROUP *group, EC_SCALAR *r,
+                                     const EC_SCALAR *a);
+
 // ec_point_mul_scalar sets |r| to generator * |g_scalar| + |p| *
 // |p_scalar|. Unlike other functions which take |EC_SCALAR|, |g_scalar| and
 // |p_scalar| need not be fully reduced. They need only contain as many bits as
@@ -292,6 +307,16 @@
     const EC_GROUP *group, EC_POINT *r, const EC_SCALAR *g_scalar,
     const EC_POINT *p, const EC_SCALAR *p_scalar, BN_CTX *ctx);
 
+// ec_cmp_x_coordinate compares the x (affine) coordinate of |p| with |r|. It
+// returns one if they are equal and zero otherwise. The |ctx| must have been
+// started by the caller.
+int ec_cmp_x_coordinate(const EC_GROUP *group, const EC_POINT *p,
+                        const BIGNUM *r, BN_CTX *ctx);
+
+// ec_field_element_to_scalar reduces |r| modulo |group->order|. |r| must
+// previously have been reduced modulo |group->field|.
+int ec_field_element_to_scalar(const EC_GROUP *group, BIGNUM *r);
+
 void ec_GFp_mont_mul(const EC_GROUP *group, EC_RAW_POINT *r,
                      const EC_SCALAR *g_scalar, const EC_RAW_POINT *p,
                      const EC_SCALAR *p_scalar);
@@ -336,6 +361,14 @@
 void ec_simple_scalar_inv_montgomery(const EC_GROUP *group, EC_SCALAR *r,
                                      const EC_SCALAR *a);
 
+int ec_GFp_simple_mont_inv_mod_ord_vartime(const EC_GROUP *group, EC_SCALAR *r,
+                                           const EC_SCALAR *a);
+
+// ec_GFp_simple_cmp_x_coordinate compares the x (affine) coordinate of |p|, mod
+// the group order, with |r|. It returns one on success or zero otherwise.
+int ec_GFp_simple_cmp_x_coordinate(const EC_GROUP *group, const EC_POINT *p,
+                                   const BIGNUM *r, BN_CTX *ctx);
+
 // method functions in montgomery.c
 int ec_GFp_mont_group_init(EC_GROUP *);
 int ec_GFp_mont_group_set_curve(EC_GROUP *, const BIGNUM *p, const BIGNUM *a,
diff --git a/crypto/fipsmodule/ec/p224-64.c b/crypto/fipsmodule/ec/p224-64.c
index ebaca85..27bfc36 100644
--- a/crypto/fipsmodule/ec/p224-64.c
+++ b/crypto/fipsmodule/ec/p224-64.c
@@ -1136,6 +1136,8 @@
   out->bignum_to_felem = ec_GFp_nistp224_bignum_to_felem;
   out->felem_to_bignum = ec_GFp_nistp224_felem_to_bignum;
   out->scalar_inv_montgomery = ec_simple_scalar_inv_montgomery;
+  out->scalar_inv_montgomery_vartime = ec_GFp_simple_mont_inv_mod_ord_vartime;
+  out->cmp_x_coordinate = ec_GFp_simple_cmp_x_coordinate;
 };
 
 #endif  // BORINGSSL_HAS_UINT128 && !SMALL
diff --git a/crypto/fipsmodule/ec/p256-x86_64.c b/crypto/fipsmodule/ec/p256-x86_64.c
index 73d36b6..4008aa4 100644
--- a/crypto/fipsmodule/ec/p256-x86_64.c
+++ b/crypto/fipsmodule/ec/p256-x86_64.c
@@ -44,6 +44,12 @@
     TOBN(0xffffffff, 0xffffffff), TOBN(0x00000000, 0xfffffffe),
 };
 
+// P256_ORDER is the order of the P-256 group, not in Montgomery form.
+static const BN_ULONG P256_ORDER[P256_LIMBS] = {
+    TOBN(0xf3b9cac2, 0xfc632551), TOBN(0xbce6faad, 0xa7179e84),
+    TOBN(0xffffffff, 0xffffffff), TOBN(0xffffffff, 0x00000000),
+};
+
 // Precomputed tables for the default generator
 #include "p256-x86_64-table.h"
 
@@ -289,19 +295,63 @@
   ecp_nistz256_point_add(r, r, &h);
 }
 
+typedef union {
+  P256_POINT p;
+  P256_POINT_AFFINE a;
+} p256_point_union_t;
+
+static unsigned calc_first_wvalue(unsigned *index, const uint8_t p_str[33]) {
+  static const unsigned kWindowSize = 7;
+  static const unsigned kMask = (1 << (7 /* kWindowSize */ + 1)) - 1;
+  *index = kWindowSize;
+
+  unsigned wvalue = (p_str[0] << 1) & kMask;
+  return booth_recode_w7(wvalue);
+}
+
+static unsigned calc_wvalue(unsigned *index, const uint8_t p_str[33]) {
+  static const unsigned kWindowSize = 7;
+  static const unsigned kMask = (1 << (7 /* kWindowSize */ + 1)) - 1;
+
+  const unsigned off = (*index - 1) / 8;
+  unsigned wvalue = p_str[off] | p_str[off + 1] << 8;
+  wvalue = (wvalue >> ((*index - 1) % 8)) & kMask;
+  *index += kWindowSize;
+
+  return booth_recode_w7(wvalue);
+}
+
+static void mul_p_add_and_store(const EC_GROUP *group, EC_RAW_POINT *r,
+                                const EC_SCALAR *g_scalar,
+                                const EC_RAW_POINT *p_,
+                                const EC_SCALAR *p_scalar,
+                                p256_point_union_t *t, p256_point_union_t *p) {
+  const int p_is_infinity = g_scalar == NULL;
+  if (p_scalar != NULL) {
+    P256_POINT *out = &t->p;
+    if (p_is_infinity) {
+      out = &p->p;
+    }
+
+    ecp_nistz256_windowed_mul(group, out, p_, p_scalar);
+    if (!p_is_infinity) {
+      ecp_nistz256_point_add(&p->p, &p->p, out);
+    }
+  }
+
+  assert(group->field.width == P256_LIMBS);
+  OPENSSL_memcpy(r->X.words, p->p.X, P256_LIMBS * sizeof(BN_ULONG));
+  OPENSSL_memcpy(r->Y.words, p->p.Y, P256_LIMBS * sizeof(BN_ULONG));
+  OPENSSL_memcpy(r->Z.words, p->p.Z, P256_LIMBS * sizeof(BN_ULONG));
+}
+
 static void ecp_nistz256_points_mul(const EC_GROUP *group, EC_RAW_POINT *r,
                                     const EC_SCALAR *g_scalar,
                                     const EC_RAW_POINT *p_,
                                     const EC_SCALAR *p_scalar) {
   assert((p_ != NULL) == (p_scalar != NULL));
 
-  static const unsigned kWindowSize = 7;
-  static const unsigned kMask = (1 << (7 /* kWindowSize */ + 1)) - 1;
-
-  alignas(32) union {
-    P256_POINT p;
-    P256_POINT_AFFINE a;
-  } t, p;
+  alignas(32) p256_point_union_t t, p;
 
   if (g_scalar != NULL) {
     uint8_t p_str[33];
@@ -309,10 +359,8 @@
     p_str[32] = 0;
 
     // First window
-    unsigned wvalue = (p_str[0] << 1) & kMask;
-    unsigned index = kWindowSize;
-
-    wvalue = booth_recode_w7(wvalue);
+    unsigned index = 0;
+    unsigned wvalue = calc_first_wvalue(&index, p_str);
 
     const PRECOMP256_ROW *const precomputed_table =
         (const PRECOMP256_ROW *)ecp_nistz256_precomputed;
@@ -328,12 +376,7 @@
     copy_conditional(p.p.Z, ONE, is_not_zero(wvalue >> 1));
 
     for (int i = 1; i < 37; i++) {
-      unsigned off = (index - 1) / 8;
-      wvalue = p_str[off] | p_str[off + 1] << 8;
-      wvalue = (wvalue >> ((index - 1) % 8)) & kMask;
-      index += kWindowSize;
-
-      wvalue = booth_recode_w7(wvalue);
+      wvalue = calc_wvalue(&index, p_str);
 
       ecp_nistz256_select_w7(&t.a, precomputed_table[i], wvalue >> 1);
 
@@ -344,23 +387,60 @@
     }
   }
 
-  const int p_is_infinity = g_scalar == NULL;
-  if (p_scalar != NULL) {
-    P256_POINT *out = &t.p;
-    if (p_is_infinity) {
-      out = &p.p;
-    }
+  mul_p_add_and_store(group, r, g_scalar, p_, p_scalar, &t, &p);
+}
 
-    ecp_nistz256_windowed_mul(group, out, p_, p_scalar);
-    if (!p_is_infinity) {
-      ecp_nistz256_point_add(&p.p, &p.p, out);
-    }
+static void ecp_nistz256_points_mul_public(const EC_GROUP *group,
+                                           EC_RAW_POINT *r,
+                                           const EC_SCALAR *g_scalar,
+                                           const EC_RAW_POINT *p_,
+                                           const EC_SCALAR *p_scalar) {
+  assert(p_ != NULL && p_scalar != NULL && g_scalar != NULL);
+
+  alignas(32) p256_point_union_t t, p;
+  uint8_t p_str[33];
+  OPENSSL_memcpy(p_str, g_scalar->bytes, 32);
+  p_str[32] = 0;
+
+  // First window
+  unsigned index = 0;
+  unsigned wvalue = calc_first_wvalue(&index, p_str);
+
+  const PRECOMP256_ROW *const precomputed_table =
+      (const PRECOMP256_ROW *)ecp_nistz256_precomputed;
+
+  // Convert |p| from affine to Jacobian coordinates. We set Z to zero if |p|
+  // is infinity and |ONE| otherwise. |p| was computed from the table, so it
+  // is infinity iff |wvalue >> 1| is zero.
+  if ((wvalue >> 1) != 0) {
+    OPENSSL_memcpy(&p.a, &precomputed_table[0][(wvalue >> 1) - 1], sizeof(p.a));
+    OPENSSL_memcpy(&p.p.Z, ONE, sizeof(p.p.Z));
+  } else {
+    OPENSSL_memset(&p.a, 0, sizeof(p.a));
+    OPENSSL_memset(p.p.Z, 0, sizeof(p.p.Z));
   }
 
-  assert(group->field.width == P256_LIMBS);
-  OPENSSL_memcpy(r->X.words, p.p.X, P256_LIMBS * sizeof(BN_ULONG));
-  OPENSSL_memcpy(r->Y.words, p.p.Y, P256_LIMBS * sizeof(BN_ULONG));
-  OPENSSL_memcpy(r->Z.words, p.p.Z, P256_LIMBS * sizeof(BN_ULONG));
+  if ((wvalue & 1) == 1) {
+    ecp_nistz256_neg(p.p.Y, p.p.Y);
+  }
+
+  for (int i = 1; i < 37; i++) {
+    wvalue = calc_wvalue(&index, p_str);
+
+    if ((wvalue >> 1) == 0) {
+      continue;
+    }
+
+    OPENSSL_memcpy(&t.a, &precomputed_table[i][(wvalue >> 1) - 1], sizeof(p.a));
+
+    if ((wvalue & 1) == 1) {
+      ecp_nistz256_neg(t.a.Y, t.a.Y);
+    }
+
+    ecp_nistz256_point_add_affine(&p.p, &p.p, &t.a);
+  }
+
+  mul_p_add_and_store(group, r, g_scalar, p_, p_scalar, &t, &p);
 }
 
 static int ecp_nistz256_get_affine(const EC_GROUP *group,
@@ -513,6 +593,73 @@
   }
 }
 
+static int ecp_nistz256_mont_inv_mod_ord_vartime(const EC_GROUP *group,
+                                                 EC_SCALAR *out,
+                                                 const EC_SCALAR *in) {
+  if ((OPENSSL_ia32cap_P[1] & (1 << 28)) == 0) {
+    // No AVX support; fallback to generic code.
+    return ec_GFp_simple_mont_inv_mod_ord_vartime(group, out, in);
+  }
+
+  if (!beeu_mod_inverse_vartime(out->words, in->words, P256_ORDER)) {
+    return 0;
+  }
+
+  // The result should be returned in the Montgomery domain.
+  ec_scalar_to_montgomery(group, out, out);
+  return 1;
+}
+
+static int ecp_nistz256_cmp_x_coordinate(const EC_GROUP *group,
+                                         const EC_POINT *p, const BIGNUM *r,
+                                         BN_CTX *ctx) {
+  if (ec_GFp_simple_is_at_infinity(group, &p->raw)) {
+    OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY);
+    return 0;
+  }
+
+  BN_ULONG r_words[P256_LIMBS];
+  if (!bn_copy_words(r_words, P256_LIMBS, r)) {
+    return 0;
+  }
+
+  // We wish to compare X/Z^2 with r. This is equivalent to comparing X with
+  // r*Z^2. Note that X and Z are represented in Montgomery form, while r is
+  // not.
+  BN_ULONG r_Z2[P256_LIMBS], Z2_mont[P256_LIMBS], X[P256_LIMBS];
+  ecp_nistz256_mul_mont(Z2_mont, p->raw.Z.words, p->raw.Z.words);
+  ecp_nistz256_mul_mont(r_Z2, r_words, Z2_mont);
+  ecp_nistz256_from_mont(X, p->raw.X.words);
+
+  if (OPENSSL_memcmp(r_Z2, X, sizeof(r_Z2)) == 0) {
+    return 1;
+  }
+
+  // During signing the x coefficient is reduced modulo the group order.
+  // Therefore there is a small possibility, less than 1/2^128, that group_order
+  // < p.x < P. in that case we need not only to compare against |r| but also to
+  // compare against r+group_order.
+
+  // P_MINUS_ORDER is the difference between the field order (p) and the group
+  // order (N). This value is not in the Montgomery domain.
+  static const BN_ULONG P_MINUS_ORDER[P256_LIMBS] = {
+      TOBN(0x0c46353d, 0x039cdaae), TOBN(0x43190553, 0x58e8617b),
+      TOBN(0x00000000, 0x00000000), TOBN(0x00000000, 0x00000000)};
+
+  if (bn_less_than_words(r_words, P_MINUS_ORDER, P256_LIMBS)) {
+    // We can add in-place, ignoring the carry, because: r + group_order < p <
+    // 2^256
+    bn_add_words(r_words, r_words, P256_ORDER, P256_LIMBS);
+    ecp_nistz256_mul_mont(r_Z2, r_words, Z2_mont);
+    if (OPENSSL_memcmp(r_Z2, X, sizeof(r_Z2)) == 0) {
+      return 1;
+    }
+  }
+
+  OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE);
+  return 0;
+}
+
 DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistz256_method) {
   out->group_init = ec_GFp_mont_group_init;
   out->group_finish = ec_GFp_mont_group_finish;
@@ -521,12 +668,14 @@
   out->add = ecp_nistz256_add;
   out->dbl = ecp_nistz256_dbl;
   out->mul = ecp_nistz256_points_mul;
-  out->mul_public = ecp_nistz256_points_mul;
+  out->mul_public = ecp_nistz256_points_mul_public;
   out->felem_mul = ec_GFp_mont_felem_mul;
   out->felem_sqr = ec_GFp_mont_felem_sqr;
   out->bignum_to_felem = ec_GFp_mont_bignum_to_felem;
   out->felem_to_bignum = ec_GFp_mont_felem_to_bignum;
   out->scalar_inv_montgomery = ecp_nistz256_inv_mod_ord;
+  out->scalar_inv_montgomery_vartime = ecp_nistz256_mont_inv_mod_ord_vartime;
+  out->cmp_x_coordinate = ecp_nistz256_cmp_x_coordinate;
 };
 
 #endif /* !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
diff --git a/crypto/fipsmodule/ec/p256-x86_64.h b/crypto/fipsmodule/ec/p256-x86_64.h
index 21b461c..9de3240 100644
--- a/crypto/fipsmodule/ec/p256-x86_64.h
+++ b/crypto/fipsmodule/ec/p256-x86_64.h
@@ -61,6 +61,16 @@
   ecp_nistz256_mul_mont(res, in, ONE);
 }
 
+// ecp_nistz256_to_mont sets |res| to |in|, converted to Montgomery domain
+// by multiplying with RR = 2^512 mod P precomputed for NIST P256 curve.
+static inline void ecp_nistz256_to_mont(BN_ULONG res[P256_LIMBS],
+                                        const BN_ULONG in[P256_LIMBS]) {
+  static const BN_ULONG RR[P256_LIMBS] = {
+      TOBN(0x00000000, 0x00000003), TOBN(0xfffffffb, 0xffffffff),
+      TOBN(0xffffffff, 0xfffffffe), TOBN(0x00000004, 0xfffffffd)};
+  ecp_nistz256_mul_mont(res, in, RR);
+}
+
 
 // P-256 scalar operations.
 //
@@ -79,6 +89,12 @@
 void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS],
                                const BN_ULONG a[P256_LIMBS], int rep);
 
+// beeu_mod_inverse_vartime sets out = a^-1 mod p using a Euclidean algorithm.
+// Assumption: 0 < a < p < 2^(256) and p is odd.
+int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS],
+                             const BN_ULONG a[P256_LIMBS],
+                             const BN_ULONG p[P256_LIMBS]);
+
 
 // P-256 point operations.
 //
diff --git a/crypto/fipsmodule/ec/p256-x86_64_test.cc b/crypto/fipsmodule/ec/p256-x86_64_test.cc
index 8ed1dd4..ab93dfb 100644
--- a/crypto/fipsmodule/ec/p256-x86_64_test.cc
+++ b/crypto/fipsmodule/ec/p256-x86_64_test.cc
@@ -24,8 +24,12 @@
 #include <gtest/gtest.h>
 
 #include <openssl/bn.h>
+#include <openssl/cpu.h>
+#include <openssl/ec.h>
 #include <openssl/mem.h>
+#include <openssl/nid.h>
 
+#include "internal.h"
 #include "../bn/internal.h"
 #include "../../internal.h"
 #include "../../test/file_test.h"
@@ -87,6 +91,64 @@
   }
 }
 
+TEST(P256_X86_64Test, BEEU) {
+  if ((OPENSSL_ia32cap_P[1] & (1 << 28)) == 0) {
+    // No AVX support; cannot run the BEEU code.
+    return;
+  }
+
+  bssl::UniquePtr<EC_GROUP> group(
+      EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1));
+  ASSERT_TRUE(group);
+
+  BN_ULONG order_words[P256_LIMBS];
+  ASSERT_TRUE(
+      bn_copy_words(order_words, P256_LIMBS, EC_GROUP_get0_order(group.get())));
+
+  BN_ULONG in[P256_LIMBS], out[P256_LIMBS];
+  EC_SCALAR in_scalar, out_scalar, result;
+  OPENSSL_memset(in, 0, sizeof(in));
+
+  // Trying to find the inverse of zero should fail.
+  ASSERT_FALSE(beeu_mod_inverse_vartime(out, in, order_words));
+
+  // kOneMont is 1, in Montgomery form.
+  static const BN_ULONG kOneMont[P256_LIMBS] = {
+      TOBN(0xc46353d, 0x039cdaaf), TOBN(0x43190552, 0x58e8617b),
+      0, 0xffffffff,
+  };
+
+  for (BN_ULONG i = 1; i < 2000; i++) {
+    SCOPED_TRACE(i);
+
+    in[0] = i;
+    if (i >= 1000) {
+      in[1] = i << 8;
+      in[2] = i << 32;
+      in[3] = i << 48;
+    } else {
+      in[1] = in[2] = in[3] = 0;
+    }
+
+    EXPECT_TRUE(bn_less_than_words(in, order_words, P256_LIMBS));
+    ASSERT_TRUE(beeu_mod_inverse_vartime(out, in, order_words));
+    EXPECT_TRUE(bn_less_than_words(out, order_words, P256_LIMBS));
+
+    // Calculate out*in and confirm that it equals one, modulo the order.
+    OPENSSL_memcpy(in_scalar.bytes, in, sizeof(in));
+    OPENSSL_memcpy(out_scalar.bytes, out, sizeof(out));
+    ec_scalar_to_montgomery(group.get(), &in_scalar, &in_scalar);
+    ec_scalar_to_montgomery(group.get(), &out_scalar, &out_scalar);
+    ec_scalar_mul_montgomery(group.get(), &result, &in_scalar, &out_scalar);
+
+    EXPECT_EQ(0, OPENSSL_memcmp(kOneMont, &result, sizeof(kOneMont)));
+
+    // Invert the result and expect to get back to the original value.
+    ASSERT_TRUE(beeu_mod_inverse_vartime(out, out, order_words));
+    EXPECT_EQ(0, OPENSSL_memcmp(in, out, sizeof(in)));
+  }
+}
+
 static bool GetFieldElement(FileTest *t, BN_ULONG out[P256_LIMBS],
                             const char *name) {
   std::vector<uint8_t> bytes;
diff --git a/crypto/fipsmodule/ec/scalar.c b/crypto/fipsmodule/ec/scalar.c
index 1bd6b02..88678a9 100644
--- a/crypto/fipsmodule/ec/scalar.c
+++ b/crypto/fipsmodule/ec/scalar.c
@@ -74,3 +74,8 @@
                               const EC_SCALAR *a) {
   group->meth->scalar_inv_montgomery(group, r, a);
 }
+
+int ec_scalar_inv_montgomery_vartime(const EC_GROUP *group, EC_SCALAR *r,
+                                     const EC_SCALAR *a) {
+  return group->meth->scalar_inv_montgomery_vartime(group, r, a);
+}
diff --git a/crypto/fipsmodule/ec/simple.c b/crypto/fipsmodule/ec/simple.c
index bf4aa4f..f875f22 100644
--- a/crypto/fipsmodule/ec/simple.c
+++ b/crypto/fipsmodule/ec/simple.c
@@ -354,3 +354,55 @@
   // The points are equal.
   return 0;
 }
+
+int ec_GFp_simple_mont_inv_mod_ord_vartime(const EC_GROUP *group,
+                                           EC_SCALAR *out,
+                                           const EC_SCALAR *in) {
+  // This implementation (in fact) runs in constant time,
+  // even though for this interface it is not mandatory.
+
+  // out = in^-1 in the Montgomery domain. This is
+  // |ec_scalar_to_montgomery| followed by |ec_scalar_inv_montgomery|, but
+  // |ec_scalar_inv_montgomery| followed by |ec_scalar_from_montgomery| is
+  // equivalent and slightly more efficient.
+  ec_scalar_inv_montgomery(group, out, in);
+  ec_scalar_from_montgomery(group, out, out);
+  return 1;
+}
+
+// Compares the x (affine) coordinate of the point p with x.
+// Return 1 on success 0 otherwise
+// Assumption: the caller starts the BN_CTX.
+int ec_GFp_simple_cmp_x_coordinate(const EC_GROUP *group, const EC_POINT *p,
+                                   const BIGNUM *r, BN_CTX *ctx) {
+  int ret = 0;
+  BN_CTX_start(ctx);
+
+  BIGNUM *X = BN_CTX_get(ctx);
+  if (X == NULL) {
+    OPENSSL_PUT_ERROR(ECDSA, ERR_R_BN_LIB);
+    goto out;
+  }
+
+  if (!EC_POINT_get_affine_coordinates_GFp(group, p, X, NULL, ctx)) {
+    OPENSSL_PUT_ERROR(ECDSA, ERR_R_EC_LIB);
+    goto out;
+  }
+
+  if (!ec_field_element_to_scalar(group, X)) {
+    OPENSSL_PUT_ERROR(ECDSA, ERR_R_BN_LIB);
+    goto out;
+  }
+
+  // The signature is correct iff |X| is equal to |r|.
+  if (BN_ucmp(X, r) != 0) {
+    OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE);
+    goto out;
+  }
+
+  ret = 1;
+
+out:
+  BN_CTX_end(ctx);
+  return ret;
+}
diff --git a/crypto/fipsmodule/ecdsa/ecdsa.c b/crypto/fipsmodule/ecdsa/ecdsa.c
index f3ce214..7ec4565 100644
--- a/crypto/fipsmodule/ecdsa/ecdsa.c
+++ b/crypto/fipsmodule/ecdsa/ecdsa.c
@@ -98,40 +98,6 @@
                           order->width);
 }
 
-// field_element_to_scalar reduces |r| modulo |group->order|. |r| must
-// previously have been reduced modulo |group->field|.
-static int field_element_to_scalar(const EC_GROUP *group, BIGNUM *r) {
-  // We must have p < 2×order, assuming p is not tiny (p >= 17). Thus rather we
-  // can reduce by performing at most one subtraction.
-  //
-  // Proof: We only work with prime order curves, so the number of points on
-  // the curve is the order. Thus Hasse's theorem gives:
-  //
-  //     |order - (p + 1)| <= 2×sqrt(p)
-  //         p + 1 - order <= 2×sqrt(p)
-  //     p + 1 - 2×sqrt(p) <= order
-  //       p + 1 - 2×(p/4)  < order       (p/4 > sqrt(p) for p >= 17)
-  //         p/2 < p/2 + 1  < order
-  //                     p  < 2×order
-  //
-  // Additionally, one can manually check this property for built-in curves. It
-  // is enforced for legacy custom curves in |EC_GROUP_set_generator|.
-  //
-  // TODO(davidben): Introduce |EC_FIELD_ELEMENT|, make this a function from
-  // |EC_FIELD_ELEMENT| to |EC_SCALAR|, and cut out the |BIGNUM|. Does this need
-  // to be constant-time for signing? |r| is the x-coordinate for kG, which is
-  // public unless k was rerolled because |s| was zero.
-  assert(!BN_is_negative(r));
-  assert(BN_cmp(r, &group->field) < 0);
-  if (BN_cmp(r, &group->order) >= 0 &&
-      !BN_sub(r, r, &group->order)) {
-    return 0;
-  }
-  assert(!BN_is_negative(r));
-  assert(BN_cmp(r, &group->order) < 0);
-  return 1;
-}
-
 ECDSA_SIG *ECDSA_SIG_new(void) {
   ECDSA_SIG *sig = OPENSSL_malloc(sizeof(ECDSA_SIG));
   if (sig == NULL) {
@@ -193,12 +159,6 @@
   }
   int ret = 0;
   EC_POINT *point = NULL;
-  BN_CTX_start(ctx);
-  BIGNUM *X = BN_CTX_get(ctx);
-  if (X == NULL) {
-    OPENSSL_PUT_ERROR(ECDSA, ERR_R_BN_LIB);
-    goto err;
-  }
 
   EC_SCALAR r, s, u1, u2, s_inv_mont, m;
   if (BN_is_zero(sig->r) ||
@@ -210,11 +170,7 @@
   }
 
   // s_inv_mont = s^-1 in the Montgomery domain. This is
-  // |ec_scalar_to_montgomery| followed by |ec_scalar_inv_montgomery|, but
-  // |ec_scalar_inv_montgomery| followed by |ec_scalar_from_montgomery| is
-  // equivalent and slightly more efficient.
-  ec_scalar_inv_montgomery(group, &s_inv_mont, &s);
-  ec_scalar_from_montgomery(group, &s_inv_mont, &s_inv_mont);
+  ec_scalar_inv_montgomery_vartime(group, &s_inv_mont, &s);
 
   // u1 = m * s^-1 mod order
   // u2 = r * s^-1 mod order
@@ -234,24 +190,10 @@
     OPENSSL_PUT_ERROR(ECDSA, ERR_R_EC_LIB);
     goto err;
   }
-  if (!EC_POINT_get_affine_coordinates_GFp(group, point, X, NULL, ctx)) {
-    OPENSSL_PUT_ERROR(ECDSA, ERR_R_EC_LIB);
-    goto err;
-  }
-  if (!field_element_to_scalar(group, X)) {
-    OPENSSL_PUT_ERROR(ECDSA, ERR_R_BN_LIB);
-    goto err;
-  }
-  // The signature is correct iff |X| is equal to |sig->r|.
-  if (BN_ucmp(X, sig->r) != 0) {
-    OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE);
-    goto err;
-  }
 
-  ret = 1;
+  ret = ec_cmp_x_coordinate(group, point, sig->r, ctx);
 
 err:
-  BN_CTX_end(ctx);
   BN_CTX_free(ctx);
   EC_POINT_free(point);
   return ret;
@@ -320,7 +262,7 @@
       goto err;
     }
 
-    if (!field_element_to_scalar(group, r)) {
+    if (!ec_field_element_to_scalar(group, r)) {
       goto err;
     }
   } while (BN_is_zero(r));
diff --git a/third_party/fiat/p256.c b/third_party/fiat/p256.c
index 2d40ea0..b6f3f49 100644
--- a/third_party/fiat/p256.c
+++ b/third_party/fiat/p256.c
@@ -1836,6 +1836,8 @@
   out->bignum_to_felem = ec_GFp_mont_bignum_to_felem;
   out->felem_to_bignum = ec_GFp_mont_felem_to_bignum;
   out->scalar_inv_montgomery = ec_simple_scalar_inv_montgomery;
+  out->scalar_inv_montgomery_vartime = ec_GFp_simple_mont_inv_mod_ord_vartime;
+  out->cmp_x_coordinate = ec_GFp_simple_cmp_x_coordinate;
 };
 
 #undef BORINGSSL_NISTP256_64BIT