Revert "Speed up ECDSA verify on x86-64." This reverts commit 3d450d2844db825a906fc19f7bb1e6ce765047db. It fails SDE, looks like a missing CPUID check before using vector instructions. Change-Id: I6b7dd71d9e5b1f509d2e018bd8be38c973476b4e Reviewed-on: https://boringssl-review.googlesource.com/c/32864 Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt index 463febb..e6c8cc6 100644 --- a/crypto/fipsmodule/CMakeLists.txt +++ b/crypto/fipsmodule/CMakeLists.txt
@@ -11,7 +11,6 @@ ghash-x86_64.${ASM_EXT} md5-x86_64.${ASM_EXT} p256-x86_64-asm.${ASM_EXT} - p256_beeu-x86_64-asm.${ASM_EXT} rdrand-x86_64.${ASM_EXT} rsaz-avx2.${ASM_EXT} sha1-x86_64.${ASM_EXT} @@ -101,7 +100,6 @@ perlasm(md5-586.${ASM_EXT} md5/asm/md5-586.pl) perlasm(md5-x86_64.${ASM_EXT} md5/asm/md5-x86_64.pl) perlasm(p256-x86_64-asm.${ASM_EXT} ec/asm/p256-x86_64-asm.pl) -perlasm(p256_beeu-x86_64-asm.${ASM_EXT} ec/asm/p256_beeu-x86_64-asm.pl) perlasm(rdrand-x86_64.${ASM_EXT} rand/asm/rdrand-x86_64.pl) perlasm(rsaz-avx2.${ASM_EXT} bn/asm/rsaz-avx2.pl) perlasm(sha1-586.${ASM_EXT} sha/asm/sha1-586.pl)
diff --git a/crypto/fipsmodule/ec/asm/p256_beeu-x86_64-asm.pl b/crypto/fipsmodule/ec/asm/p256_beeu-x86_64-asm.pl deleted file mode 100644 index 12b9f5a..0000000 --- a/crypto/fipsmodule/ec/asm/p256_beeu-x86_64-asm.pl +++ /dev/null
@@ -1,405 +0,0 @@ -# Copyright (c) 2018, Amazon Inc. -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -# -# Written by Nir Drucker, and Shay Gueron -# AWS Cryptographic Algorithms Group -# (ndrucker@amazon.com, gueron@amazon.com) -# based on BN_mod_inverse_odd - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; -*STDOUT=*OUT; - -############################################################################# -# extern int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS], -# BN_ULONG a[P256_LIMBS], -# BN_ULONG n[P256_LIMBS]); -# -# (Binary Extended Euclidean Algorithm. -# See https://en.wikipedia.org/wiki/Binary_GCD_algorithm) -# -# Assumption 1: n is odd for the BEEU -# Assumption 2: 1 < a < n < 2^256 - -$out = "%rdi"; -$a = "%rsi"; -$n = "%rdx"; - -# X/Y will hold the inverse parameter -# Assumption: X,Y<2^(256) -$x0 = "%r8"; -$x1 = "%r9"; -$x2 = "%r10"; -$x3 = "%r11"; -# borrow from out (out is needed only at the end) -$x4 = "%rdi"; -$y0 = "%r12"; -$y1 = "%r13"; -$y2 = "%r14"; -$y3 = "%r15"; -$y4 = "%rbp"; -$shift = "%rcx"; -$t0 = "%rax"; -$t1 = "%rbx"; -$t2 = "%rsi"; -# borrow -$t3 = "%rcx"; - -$T0 = "%xmm0"; -$T1 = "%xmm1"; - -# Offsets on the stack -$out_rsp = 0; -$shift_rsp = $out_rsp+0x8; -$a_rsp0 = $shift_rsp+0x8; -$a_rsp1 = $a_rsp0+0x8; -$a_rsp2 = $a_rsp1+0x8; -$a_rsp3 = $a_rsp2+0x8; -$b_rsp0 = $a_rsp3+0x8; -$b_rsp1 = $b_rsp0+0x8; -$b_rsp2 = $b_rsp1+0x8; -$b_rsp3 = $b_rsp2+0x8; - -# Borrow when a_rsp/b_rsp are no longer needed. -$y_rsp0 = $a_rsp0; -$y_rsp1 = $y_rsp0+0x8; -$y_rsp2 = $y_rsp1+0x8; -$y_rsp3 = $y_rsp2+0x8; -$y_rsp4 = $y_rsp3+0x8; -$last_rsp_offset = $b_rsp3+0x8; - -sub TEST_B_ZERO { - return <<___; - xorq $t1, $t1 - or $b_rsp0(%rsp), $t1 - or $b_rsp1(%rsp), $t1 - or $b_rsp2(%rsp), $t1 - or $b_rsp3(%rsp), $t1 - jz .Lbeeu_loop_end -___ -} - -$g_next_label = 0; - -sub SHIFT1 { - my ($var0, $var1, $var2, $var3, $var4) = @_; - my $label = ".Lshift1_${g_next_label}"; - $g_next_label++; - - return <<___; - # Ensure X is even and divide by two. - movq \$1, $t1 - andq $var0, $t1 - jz $label - add 0*8($n), $var0 - adc 1*8($n), $var1 - adc 2*8($n), $var2 - adc 3*8($n), $var3 - adc \$0, $var4 - -$label: - shrdq \$1, $var1, $var0 - shrdq \$1, $var2, $var1 - shrdq \$1, $var3, $var2 - shrdq \$1, $var4, $var3 - shrq \$1, $var4 -___ -} - -sub SHIFT256 { - my ($var) = @_; - return <<___; - # Copy shifted values. - # Remember not to override t3=rcx - movq 1*8+$var(%rsp), $t0 - movq 2*8+$var(%rsp), $t1 - movq 3*8+$var(%rsp), $t2 - - shrdq %cl, $t0, 0*8+$var(%rsp) - shrdq %cl, $t1, 1*8+$var(%rsp) - shrdq %cl, $t2, 2*8+$var(%rsp) - - shrq %cl, $t2 - mov $t2, 3*8+$var(%rsp) -___ -} - -$code.=<<___; -.text - -.type beeu_mod_inverse_vartime,\@function -.hidden beeu_mod_inverse_vartime -.globl beeu_mod_inverse_vartime -.align 32 -beeu_mod_inverse_vartime: -.cfi_startproc - push %rbp -.cfi_push rbp - movq %rsp, %rbp -.cfi_def_cfa_register rbp - - push %r12 -.cfi_push r12 - push %r13 -.cfi_push r13 - push %r14 -.cfi_push r14 - push %r15 -.cfi_push r15 - push %rbx -.cfi_push rbx - push %rsi -.cfi_push rsi - - sub \$$last_rsp_offset, %rsp - movq $out, $out_rsp(%rsp) - - # X=1, Y=0 - movq \$1, $x0 - xorq $x1, $x1 - xorq $x2, $x2 - xorq $x3, $x3 - xorq $x4, $x4 - - xorq $y0, $y0 - xorq $y1, $y1 - xorq $y2, $y2 - xorq $y3, $y3 - xorq $y4, $y4 - - # Copy a/n into B/A on the stack. - vmovdqu 0*8($a), $T0 - vmovdqu 2*8($a), $T1 - vmovdqu $T0, $b_rsp0(%rsp) - vmovdqu $T1, $b_rsp2(%rsp) - - vmovdqu 0*8($n), $T0 - vmovdqu 2*8($n), $T1 - vmovdqu $T0, $a_rsp0(%rsp) - vmovdqu $T1, $a_rsp2(%rsp) - -.Lbeeu_loop: - ${\TEST_B_ZERO} - - # 0 < B < |n|, - # 0 < A <= |n|, - # (1) X*a == B (mod |n|), - # (2) (-1)*Y*a == A (mod |n|) - - # Now divide B by the maximum possible power of two in the - # integers, and divide X by the same value mod |n|. When we're - # done, (1) still holds. - movq \$1, $shift - - # Note that B > 0 -.Lbeeu_shift_loop_XB: - movq $shift, $t1 - andq $b_rsp0(%rsp), $t1 - jnz .Lbeeu_shift_loop_end_XB - - ${\SHIFT1($x0, $x1, $x2, $x3, $x4)} - shl \$1, $shift - - # Test wraparound of the shift parameter. The probability to have 32 zeroes - # in a row is small Therefore having the value below equal \$0x8000000 or - # \$0x8000 does not affect the performance. We choose 0x8000000 because it - # is the maximal immediate value possible. - cmp \$0x8000000, $shift - jne .Lbeeu_shift_loop_XB - -.Lbeeu_shift_loop_end_XB: - bsf $shift, $shift - test $shift, $shift - jz .Lbeeu_no_shift_XB - - ${\SHIFT256($b_rsp0)} - -.Lbeeu_no_shift_XB: - # Same for A and Y. Afterwards, (2) still holds. - movq \$1, $shift - - # Note that A > 0 -.Lbeeu_shift_loop_YA: - movq $shift, $t1 - andq $a_rsp0(%rsp), $t1 - jnz .Lbeeu_shift_loop_end_YA - - ${\SHIFT1($y0, $y1, $y2, $y3, $y4)} - shl \$1, $shift - - # Test wraparound of the shift parameter. The probability to have 32 zeroes - # in a row is small therefore having the value below equal \$0x8000000 or - # \$0x8000 Does not affect the performance. We choose 0x8000000 because it - # is the maximal immediate value possible. - cmp \$0x8000000, $shift - jne .Lbeeu_shift_loop_YA - -.Lbeeu_shift_loop_end_YA: - bsf $shift, $shift - test $shift, $shift - jz .Lbeeu_no_shift_YA - - ${\SHIFT256($a_rsp0)} - -.Lbeeu_no_shift_YA: - # T = B-A (A,B < 2^256) - mov $b_rsp0(%rsp), $t0 - mov $b_rsp1(%rsp), $t1 - mov $b_rsp2(%rsp), $t2 - mov $b_rsp3(%rsp), $t3 - sub $a_rsp0(%rsp), $t0 - sbb $a_rsp1(%rsp), $t1 - sbb $a_rsp2(%rsp), $t2 - sbb $a_rsp3(%rsp), $t3 # borrow from shift - jnc .Lbeeu_B_bigger_than_A - - # A = A - B - mov $a_rsp0(%rsp), $t0 - mov $a_rsp1(%rsp), $t1 - mov $a_rsp2(%rsp), $t2 - mov $a_rsp3(%rsp), $t3 - sub $b_rsp0(%rsp), $t0 - sbb $b_rsp1(%rsp), $t1 - sbb $b_rsp2(%rsp), $t2 - sbb $b_rsp3(%rsp), $t3 - mov $t0, $a_rsp0(%rsp) - mov $t1, $a_rsp1(%rsp) - mov $t2, $a_rsp2(%rsp) - mov $t3, $a_rsp3(%rsp) - - # Y = Y + X - add $x0, $y0 - adc $x1, $y1 - adc $x2, $y2 - adc $x3, $y3 - adc $x4, $y4 - jmp .Lbeeu_loop - -.Lbeeu_B_bigger_than_A: - # B = T = B - A - mov $t0, $b_rsp0(%rsp) - mov $t1, $b_rsp1(%rsp) - mov $t2, $b_rsp2(%rsp) - mov $t3, $b_rsp3(%rsp) - - # X = Y + X - add $y0, $x0 - adc $y1, $x1 - adc $y2, $x2 - adc $y3, $x3 - adc $y4, $x4 - - jmp .Lbeeu_loop - -.Lbeeu_loop_end: - # The Euclid's algorithm loop ends when A == beeu(a,n); - # Therefore (-1)*Y*a == A (mod |n|), Y>0 - - # Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|) - mov $a_rsp0(%rsp), $t1 - sub \$1, $t1 - or $a_rsp1(%rsp), $t1 - or $a_rsp2(%rsp), $t1 - or $a_rsp3(%rsp), $t1 - # If not, fail. - jnz .Lbeeu_err - - # From this point on, we no longer need X - # Therefore we use it as a temporary storage. - # X = n - movq 0*8($n), $x0 - movq 1*8($n), $x1 - movq 2*8($n), $x2 - movq 3*8($n), $x3 - xorq $x4, $x4 - -.Lbeeu_reduction_loop: - movq $y0, $y_rsp0(%rsp) - movq $y1, $y_rsp1(%rsp) - movq $y2, $y_rsp2(%rsp) - movq $y3, $y_rsp3(%rsp) - movq $y4, $y_rsp4(%rsp) - - # If Y>n ==> Y=Y-n - sub $x0, $y0 - sbb $x1, $y1 - sbb $x2, $y2 - sbb $x3, $y3 - sbb \$0, $y4 - - # Choose old Y or new Y - cmovc $y_rsp0(%rsp), $y0 - cmovc $y_rsp1(%rsp), $y1 - cmovc $y_rsp2(%rsp), $y2 - cmovc $y_rsp3(%rsp), $y3 - jnc .Lbeeu_reduction_loop - - # X = n - Y (n, Y < 2^256), (Cancel the (-1)) - sub $y0, $x0 - sbb $y1, $x1 - sbb $y2, $x2 - sbb $y3, $x3 - -.Lbeeu_save: - # Save the inverse(<2^256) to out. - mov $out_rsp(%rsp), $out - - movq $x0, 0*8($out) - movq $x1, 1*8($out) - movq $x2, 2*8($out) - movq $x3, 3*8($out) - - # Return 1. - movq \$1, %rax - jmp .Lbeeu_finish - -.Lbeeu_err: - # Return 0. - xorq %rax, %rax - -.Lbeeu_finish: - add \$$last_rsp_offset, %rsp - pop %rsi -.cfi_pop rsi - pop %rbx -.cfi_pop rbx - pop %r15 -.cfi_pop r15 - pop %r14 -.cfi_pop r14 - pop %r13 -.cfi_pop r13 - pop %r12 -.cfi_pop r12 - pop %rbp -.cfi_pop rbp -.cfi_def_cfa rsp, 8 -.cfi_endproc - ret - -.size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime -___ - -print $code; -close STDOUT;
diff --git a/crypto/fipsmodule/ec/ec.c b/crypto/fipsmodule/ec/ec.c index 4c557a1..134d051 100644 --- a/crypto/fipsmodule/ec/ec.c +++ b/crypto/fipsmodule/ec/ec.c
@@ -911,43 +911,6 @@ return 1; } -int ec_cmp_x_coordinate(const EC_GROUP *group, const EC_POINT *p, - const BIGNUM *r, BN_CTX *ctx) { - return group->meth->cmp_x_coordinate(group, p, r, ctx); -} - -int ec_field_element_to_scalar(const EC_GROUP *group, BIGNUM *r) { - // We must have p < 2×order, assuming p is not tiny (p >= 17). Thus rather we - // can reduce by performing at most one subtraction. - // - // Proof: We only work with prime order curves, so the number of points on - // the curve is the order. Thus Hasse's theorem gives: - // - // |order - (p + 1)| <= 2×sqrt(p) - // p + 1 - order <= 2×sqrt(p) - // p + 1 - 2×sqrt(p) <= order - // p + 1 - 2×(p/4) < order (p/4 > sqrt(p) for p >= 17) - // p/2 < p/2 + 1 < order - // p < 2×order - // - // Additionally, one can manually check this property for built-in curves. It - // is enforced for legacy custom curves in |EC_GROUP_set_generator|. - // - // TODO(davidben): Introduce |EC_FIELD_ELEMENT|, make this a function from - // |EC_FIELD_ELEMENT| to |EC_SCALAR|, and cut out the |BIGNUM|. Does this need - // to be constant-time for signing? |r| is the x-coordinate for kG, which is - // public unless k was rerolled because |s| was zero. - assert(!BN_is_negative(r)); - assert(BN_cmp(r, &group->field) < 0); - if (BN_cmp(r, &group->order) >= 0 && - !BN_sub(r, r, &group->order)) { - return 0; - } - assert(!BN_is_negative(r)); - assert(BN_cmp(r, &group->order) < 0); - return 1; -} - void EC_GROUP_set_asn1_flag(EC_GROUP *group, int flag) {} const EC_METHOD *EC_GROUP_method_of(const EC_GROUP *group) {
diff --git a/crypto/fipsmodule/ec/ec_montgomery.c b/crypto/fipsmodule/ec/ec_montgomery.c index db2d657..3fb67e2 100644 --- a/crypto/fipsmodule/ec/ec_montgomery.c +++ b/crypto/fipsmodule/ec/ec_montgomery.c
@@ -234,6 +234,4 @@ out->bignum_to_felem = ec_GFp_mont_bignum_to_felem; out->felem_to_bignum = ec_GFp_mont_felem_to_bignum; out->scalar_inv_montgomery = ec_simple_scalar_inv_montgomery; - out->scalar_inv_montgomery_vartime = ec_GFp_simple_mont_inv_mod_ord_vartime; - out->cmp_x_coordinate = ec_GFp_simple_cmp_x_coordinate; }
diff --git a/crypto/fipsmodule/ec/internal.h b/crypto/fipsmodule/ec/internal.h index 4f58cb9..e51109a 100644 --- a/crypto/fipsmodule/ec/internal.h +++ b/crypto/fipsmodule/ec/internal.h
@@ -167,21 +167,11 @@ int (*felem_to_bignum)(const EC_GROUP *group, BIGNUM *out, const EC_FELEM *in); - // scalar_inv_montgomery sets |out| to |in|^-1, where both input and output - // are in Montgomery form. + // scalar_inv_mont sets |out| to |in|^-1, where both input and output are in + // Montgomery form. void (*scalar_inv_montgomery)(const EC_GROUP *group, EC_SCALAR *out, const EC_SCALAR *in); - // scalar_inv_montgomery_vartime performs the same computation as - // |scalar_inv_montgomery|. It further assumes that the inputs are public so - // there is no concern about leaking their values through timing. - int (*scalar_inv_montgomery_vartime)(const EC_GROUP *group, EC_SCALAR *out, - const EC_SCALAR *in); - - // cmp_x_coordinate compares the x (affine) coordinate of |p|, mod the group - // order, with |r|. It returns one if they are equal and zero otherwise. - int (*cmp_x_coordinate)(const EC_GROUP *group, const EC_POINT *p, - const BIGNUM *r, BN_CTX *ctx); } /* EC_METHOD */; const EC_METHOD *EC_GFp_mont_method(void); @@ -287,11 +277,6 @@ void ec_scalar_inv_montgomery(const EC_GROUP *group, EC_SCALAR *r, const EC_SCALAR *a); -// ec_scalar_inv_montgomery_vartime performs the same actions as -// |ec_scalar_inv_montgomery|, but in variable time. -int ec_scalar_inv_montgomery_vartime(const EC_GROUP *group, EC_SCALAR *r, - const EC_SCALAR *a); - // ec_point_mul_scalar sets |r| to generator * |g_scalar| + |p| * // |p_scalar|. Unlike other functions which take |EC_SCALAR|, |g_scalar| and // |p_scalar| need not be fully reduced. They need only contain as many bits as @@ -307,16 +292,6 @@ const EC_GROUP *group, EC_POINT *r, const EC_SCALAR *g_scalar, const EC_POINT *p, const EC_SCALAR *p_scalar, BN_CTX *ctx); -// ec_cmp_x_coordinate compares the x (affine) coordinate of |p| with |r|. It -// returns one if they are equal and zero otherwise. The |ctx| must have been -// started by the caller. -int ec_cmp_x_coordinate(const EC_GROUP *group, const EC_POINT *p, - const BIGNUM *r, BN_CTX *ctx); - -// ec_field_element_to_scalar reduces |r| modulo |group->order|. |r| must -// previously have been reduced modulo |group->field|. -int ec_field_element_to_scalar(const EC_GROUP *group, BIGNUM *r); - void ec_GFp_simple_mul(const EC_GROUP *group, EC_RAW_POINT *r, const EC_SCALAR *g_scalar, const EC_RAW_POINT *p, const EC_SCALAR *p_scalar); @@ -362,14 +337,6 @@ void ec_simple_scalar_inv_montgomery(const EC_GROUP *group, EC_SCALAR *r, const EC_SCALAR *a); -int ec_GFp_simple_mont_inv_mod_ord_vartime(const EC_GROUP *group, EC_SCALAR *r, - const EC_SCALAR *a); - -// ec_GFp_simple_cmp_x_coordinate compares the x (affine) coordinate of |p|, mod -// the group order, with |r|. It returns one on success or zero otherwise. -int ec_GFp_simple_cmp_x_coordinate(const EC_GROUP *group, const EC_POINT *p, - const BIGNUM *r, BN_CTX *ctx); - // method functions in montgomery.c int ec_GFp_mont_group_init(EC_GROUP *); int ec_GFp_mont_group_set_curve(EC_GROUP *, const BIGNUM *p, const BIGNUM *a,
diff --git a/crypto/fipsmodule/ec/p224-64.c b/crypto/fipsmodule/ec/p224-64.c index 27bfc36..ebaca85 100644 --- a/crypto/fipsmodule/ec/p224-64.c +++ b/crypto/fipsmodule/ec/p224-64.c
@@ -1136,8 +1136,6 @@ out->bignum_to_felem = ec_GFp_nistp224_bignum_to_felem; out->felem_to_bignum = ec_GFp_nistp224_felem_to_bignum; out->scalar_inv_montgomery = ec_simple_scalar_inv_montgomery; - out->scalar_inv_montgomery_vartime = ec_GFp_simple_mont_inv_mod_ord_vartime; - out->cmp_x_coordinate = ec_GFp_simple_cmp_x_coordinate; }; #endif // BORINGSSL_HAS_UINT128 && !SMALL
diff --git a/crypto/fipsmodule/ec/p256-x86_64.c b/crypto/fipsmodule/ec/p256-x86_64.c index 0ff5455..73d36b6 100644 --- a/crypto/fipsmodule/ec/p256-x86_64.c +++ b/crypto/fipsmodule/ec/p256-x86_64.c
@@ -44,12 +44,6 @@ TOBN(0xffffffff, 0xffffffff), TOBN(0x00000000, 0xfffffffe), }; -// P256_ORDER is the order of the P-256 group, not in Montgomery form. -static const BN_ULONG P256_ORDER[P256_LIMBS] = { - TOBN(0xf3b9cac2, 0xfc632551), TOBN(0xbce6faad, 0xa7179e84), - TOBN(0xffffffff, 0xffffffff), TOBN(0xffffffff, 0x00000000), -}; - // Precomputed tables for the default generator #include "p256-x86_64-table.h" @@ -295,63 +289,19 @@ ecp_nistz256_point_add(r, r, &h); } -typedef union { - P256_POINT p; - P256_POINT_AFFINE a; -} p256_point_union_t; - -static unsigned calc_first_wvalue(unsigned *index, const uint8_t p_str[33]) { - static const unsigned kWindowSize = 7; - static const unsigned kMask = (1 << (7 /* kWindowSize */ + 1)) - 1; - *index = kWindowSize; - - unsigned wvalue = (p_str[0] << 1) & kMask; - return booth_recode_w7(wvalue); -} - -static unsigned calc_wvalue(unsigned *index, const uint8_t p_str[33]) { - static const unsigned kWindowSize = 7; - static const unsigned kMask = (1 << (7 /* kWindowSize */ + 1)) - 1; - - const unsigned off = (*index - 1) / 8; - unsigned wvalue = p_str[off] | p_str[off + 1] << 8; - wvalue = (wvalue >> ((*index - 1) % 8)) & kMask; - *index += kWindowSize; - - return booth_recode_w7(wvalue); -} - -static void mul_p_add_and_store(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_SCALAR *g_scalar, - const EC_RAW_POINT *p_, - const EC_SCALAR *p_scalar, - p256_point_union_t *t, p256_point_union_t *p) { - const int p_is_infinity = g_scalar == NULL; - if (p_scalar != NULL) { - P256_POINT *out = &t->p; - if (p_is_infinity) { - out = &p->p; - } - - ecp_nistz256_windowed_mul(group, out, p_, p_scalar); - if (!p_is_infinity) { - ecp_nistz256_point_add(&p->p, &p->p, out); - } - } - - assert(group->field.width == P256_LIMBS); - OPENSSL_memcpy(r->X.words, p->p.X, P256_LIMBS * sizeof(BN_ULONG)); - OPENSSL_memcpy(r->Y.words, p->p.Y, P256_LIMBS * sizeof(BN_ULONG)); - OPENSSL_memcpy(r->Z.words, p->p.Z, P256_LIMBS * sizeof(BN_ULONG)); -} - static void ecp_nistz256_points_mul(const EC_GROUP *group, EC_RAW_POINT *r, const EC_SCALAR *g_scalar, const EC_RAW_POINT *p_, const EC_SCALAR *p_scalar) { assert((p_ != NULL) == (p_scalar != NULL)); - alignas(32) p256_point_union_t t, p; + static const unsigned kWindowSize = 7; + static const unsigned kMask = (1 << (7 /* kWindowSize */ + 1)) - 1; + + alignas(32) union { + P256_POINT p; + P256_POINT_AFFINE a; + } t, p; if (g_scalar != NULL) { uint8_t p_str[33]; @@ -359,8 +309,10 @@ p_str[32] = 0; // First window - unsigned index = 0; - unsigned wvalue = calc_first_wvalue(&index, p_str); + unsigned wvalue = (p_str[0] << 1) & kMask; + unsigned index = kWindowSize; + + wvalue = booth_recode_w7(wvalue); const PRECOMP256_ROW *const precomputed_table = (const PRECOMP256_ROW *)ecp_nistz256_precomputed; @@ -376,7 +328,12 @@ copy_conditional(p.p.Z, ONE, is_not_zero(wvalue >> 1)); for (int i = 1; i < 37; i++) { - wvalue = calc_wvalue(&index, p_str); + unsigned off = (index - 1) / 8; + wvalue = p_str[off] | p_str[off + 1] << 8; + wvalue = (wvalue >> ((index - 1) % 8)) & kMask; + index += kWindowSize; + + wvalue = booth_recode_w7(wvalue); ecp_nistz256_select_w7(&t.a, precomputed_table[i], wvalue >> 1); @@ -387,60 +344,23 @@ } } - mul_p_add_and_store(group, r, g_scalar, p_, p_scalar, &t, &p); -} - -static void ecp_nistz256_points_mul_public(const EC_GROUP *group, - EC_RAW_POINT *r, - const EC_SCALAR *g_scalar, - const EC_RAW_POINT *p_, - const EC_SCALAR *p_scalar) { - assert(p_ != NULL && p_scalar != NULL && g_scalar != NULL); - - alignas(32) p256_point_union_t t, p; - uint8_t p_str[33]; - OPENSSL_memcpy(p_str, g_scalar->bytes, 32); - p_str[32] = 0; - - // First window - unsigned index = 0; - unsigned wvalue = calc_first_wvalue(&index, p_str); - - const PRECOMP256_ROW *const precomputed_table = - (const PRECOMP256_ROW *)ecp_nistz256_precomputed; - - // Convert |p| from affine to Jacobian coordinates. We set Z to zero if |p| - // is infinity and |ONE| otherwise. |p| was computed from the table, so it - // is infinity iff |wvalue >> 1| is zero. - if ((wvalue >> 1) != 0) { - OPENSSL_memcpy(&p.a, &precomputed_table[0][(wvalue >> 1) - 1], sizeof(p.a)); - OPENSSL_memcpy(&p.p.Z, ONE, sizeof(p.p.Z)); - } else { - OPENSSL_memset(&p.a, 0, sizeof(p.a)); - OPENSSL_memset(p.p.Z, 0, sizeof(p.p.Z)); - } - - if ((wvalue & 1) == 1) { - ecp_nistz256_neg(p.p.Y, p.p.Y); - } - - for (int i = 1; i < 37; i++) { - wvalue = calc_wvalue(&index, p_str); - - if ((wvalue >> 1) == 0) { - continue; + const int p_is_infinity = g_scalar == NULL; + if (p_scalar != NULL) { + P256_POINT *out = &t.p; + if (p_is_infinity) { + out = &p.p; } - OPENSSL_memcpy(&t.a, &precomputed_table[i][(wvalue >> 1) - 1], sizeof(p.a)); - - if ((wvalue & 1) == 1) { - ecp_nistz256_neg(t.a.Y, t.a.Y); + ecp_nistz256_windowed_mul(group, out, p_, p_scalar); + if (!p_is_infinity) { + ecp_nistz256_point_add(&p.p, &p.p, out); } - - ecp_nistz256_point_add_affine(&p.p, &p.p, &t.a); } - mul_p_add_and_store(group, r, g_scalar, p_, p_scalar, &t, &p); + assert(group->field.width == P256_LIMBS); + OPENSSL_memcpy(r->X.words, p.p.X, P256_LIMBS * sizeof(BN_ULONG)); + OPENSSL_memcpy(r->Y.words, p.p.Y, P256_LIMBS * sizeof(BN_ULONG)); + OPENSSL_memcpy(r->Z.words, p.p.Z, P256_LIMBS * sizeof(BN_ULONG)); } static int ecp_nistz256_get_affine(const EC_GROUP *group, @@ -593,68 +513,6 @@ } } -static int ecp_nistz256_mont_inv_mod_ord_vartime(const EC_GROUP *group, - EC_SCALAR *out, - const EC_SCALAR *in) { - if (!beeu_mod_inverse_vartime(out->words, in->words, P256_ORDER)) { - return 0; - } - - // The result should be returned in the Montgomery domain. - ec_scalar_to_montgomery(group, out, out); - return 1; -} - -static int ecp_nistz256_cmp_x_coordinate(const EC_GROUP *group, - const EC_POINT *p, const BIGNUM *r, - BN_CTX *ctx) { - if (ec_GFp_simple_is_at_infinity(group, &p->raw)) { - OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY); - return 0; - } - - BN_ULONG r_words[P256_LIMBS]; - if (!bn_copy_words(r_words, P256_LIMBS, r)) { - return 0; - } - - // We wish to compare X/Z^2 with r. This is equivalent to comparing X with - // r*Z^2. Note that X and Z are represented in Montgomery form, while r is - // not. - BN_ULONG r_Z2[P256_LIMBS], Z2_mont[P256_LIMBS], X[P256_LIMBS]; - ecp_nistz256_mul_mont(Z2_mont, p->raw.Z.words, p->raw.Z.words); - ecp_nistz256_mul_mont(r_Z2, r_words, Z2_mont); - ecp_nistz256_from_mont(X, p->raw.X.words); - - if (OPENSSL_memcmp(r_Z2, X, sizeof(r_Z2)) == 0) { - return 1; - } - - // During signing the x coefficient is reduced modulo the group order. - // Therefore there is a small possibility, less than 1/2^128, that group_order - // < p.x < P. in that case we need not only to compare against |r| but also to - // compare against r+group_order. - - // P_MINUS_ORDER is the difference between the field order (p) and the group - // order (N). This value is not in the Montgomery domain. - static const BN_ULONG P_MINUS_ORDER[P256_LIMBS] = { - TOBN(0x0c46353d, 0x039cdaae), TOBN(0x43190553, 0x58e8617b), - TOBN(0x00000000, 0x00000000), TOBN(0x00000000, 0x00000000)}; - - if (bn_less_than_words(r_words, P_MINUS_ORDER, P256_LIMBS)) { - // We can add in-place, ignoring the carry, because: r + group_order < p < - // 2^256 - bn_add_words(r_words, r_words, P256_ORDER, P256_LIMBS); - ecp_nistz256_mul_mont(r_Z2, r_words, Z2_mont); - if (OPENSSL_memcmp(r_Z2, X, sizeof(r_Z2)) == 0) { - return 1; - } - } - - OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE); - return 0; -} - DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistz256_method) { out->group_init = ec_GFp_mont_group_init; out->group_finish = ec_GFp_mont_group_finish; @@ -663,14 +521,12 @@ out->add = ecp_nistz256_add; out->dbl = ecp_nistz256_dbl; out->mul = ecp_nistz256_points_mul; - out->mul_public = ecp_nistz256_points_mul_public; + out->mul_public = ecp_nistz256_points_mul; out->felem_mul = ec_GFp_mont_felem_mul; out->felem_sqr = ec_GFp_mont_felem_sqr; out->bignum_to_felem = ec_GFp_mont_bignum_to_felem; out->felem_to_bignum = ec_GFp_mont_felem_to_bignum; out->scalar_inv_montgomery = ecp_nistz256_inv_mod_ord; - out->scalar_inv_montgomery_vartime = ecp_nistz256_mont_inv_mod_ord_vartime; - out->cmp_x_coordinate = ecp_nistz256_cmp_x_coordinate; }; #endif /* !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
diff --git a/crypto/fipsmodule/ec/p256-x86_64.h b/crypto/fipsmodule/ec/p256-x86_64.h index 9de3240..21b461c 100644 --- a/crypto/fipsmodule/ec/p256-x86_64.h +++ b/crypto/fipsmodule/ec/p256-x86_64.h
@@ -61,16 +61,6 @@ ecp_nistz256_mul_mont(res, in, ONE); } -// ecp_nistz256_to_mont sets |res| to |in|, converted to Montgomery domain -// by multiplying with RR = 2^512 mod P precomputed for NIST P256 curve. -static inline void ecp_nistz256_to_mont(BN_ULONG res[P256_LIMBS], - const BN_ULONG in[P256_LIMBS]) { - static const BN_ULONG RR[P256_LIMBS] = { - TOBN(0x00000000, 0x00000003), TOBN(0xfffffffb, 0xffffffff), - TOBN(0xffffffff, 0xfffffffe), TOBN(0x00000004, 0xfffffffd)}; - ecp_nistz256_mul_mont(res, in, RR); -} - // P-256 scalar operations. // @@ -89,12 +79,6 @@ void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], int rep); -// beeu_mod_inverse_vartime sets out = a^-1 mod p using a Euclidean algorithm. -// Assumption: 0 < a < p < 2^(256) and p is odd. -int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS], - const BN_ULONG a[P256_LIMBS], - const BN_ULONG p[P256_LIMBS]); - // P-256 point operations. //
diff --git a/crypto/fipsmodule/ec/p256-x86_64_test.cc b/crypto/fipsmodule/ec/p256-x86_64_test.cc index 68749e2..8ed1dd4 100644 --- a/crypto/fipsmodule/ec/p256-x86_64_test.cc +++ b/crypto/fipsmodule/ec/p256-x86_64_test.cc
@@ -24,11 +24,8 @@ #include <gtest/gtest.h> #include <openssl/bn.h> -#include <openssl/ec.h> #include <openssl/mem.h> -#include <openssl/nid.h> -#include "internal.h" #include "../bn/internal.h" #include "../../internal.h" #include "../../test/file_test.h" @@ -90,59 +87,6 @@ } } -TEST(P256_X86_64Test, BEEU) { - bssl::UniquePtr<EC_GROUP> group( - EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1)); - ASSERT_TRUE(group); - - BN_ULONG order_words[P256_LIMBS]; - ASSERT_TRUE( - bn_copy_words(order_words, P256_LIMBS, EC_GROUP_get0_order(group.get()))); - - BN_ULONG in[P256_LIMBS], out[P256_LIMBS]; - EC_SCALAR in_scalar, out_scalar, result; - OPENSSL_memset(in, 0, sizeof(in)); - - // Trying to find the inverse of zero should fail. - ASSERT_FALSE(beeu_mod_inverse_vartime(out, in, order_words)); - - // kOneMont is 1, in Montgomery form. - static const BN_ULONG kOneMont[P256_LIMBS] = { - TOBN(0xc46353d, 0x039cdaaf), TOBN(0x43190552, 0x58e8617b), - 0, 0xffffffff, - }; - - for (BN_ULONG i = 1; i < 2000; i++) { - SCOPED_TRACE(i); - - in[0] = i; - if (i >= 1000) { - in[1] = i << 8; - in[2] = i << 32; - in[3] = i << 48; - } else { - in[1] = in[2] = in[3] = 0; - } - - EXPECT_TRUE(bn_less_than_words(in, order_words, P256_LIMBS)); - ASSERT_TRUE(beeu_mod_inverse_vartime(out, in, order_words)); - EXPECT_TRUE(bn_less_than_words(out, order_words, P256_LIMBS)); - - // Calculate out*in and confirm that it equals one, modulo the order. - OPENSSL_memcpy(in_scalar.bytes, in, sizeof(in)); - OPENSSL_memcpy(out_scalar.bytes, out, sizeof(out)); - ec_scalar_to_montgomery(group.get(), &in_scalar, &in_scalar); - ec_scalar_to_montgomery(group.get(), &out_scalar, &out_scalar); - ec_scalar_mul_montgomery(group.get(), &result, &in_scalar, &out_scalar); - - EXPECT_EQ(0, OPENSSL_memcmp(kOneMont, &result, sizeof(kOneMont))); - - // Invert the result and expect to get back to the original value. - ASSERT_TRUE(beeu_mod_inverse_vartime(out, out, order_words)); - EXPECT_EQ(0, OPENSSL_memcmp(in, out, sizeof(in))); - } -} - static bool GetFieldElement(FileTest *t, BN_ULONG out[P256_LIMBS], const char *name) { std::vector<uint8_t> bytes;
diff --git a/crypto/fipsmodule/ec/scalar.c b/crypto/fipsmodule/ec/scalar.c index 88678a9..1bd6b02 100644 --- a/crypto/fipsmodule/ec/scalar.c +++ b/crypto/fipsmodule/ec/scalar.c
@@ -74,8 +74,3 @@ const EC_SCALAR *a) { group->meth->scalar_inv_montgomery(group, r, a); } - -int ec_scalar_inv_montgomery_vartime(const EC_GROUP *group, EC_SCALAR *r, - const EC_SCALAR *a) { - return group->meth->scalar_inv_montgomery_vartime(group, r, a); -}
diff --git a/crypto/fipsmodule/ec/simple.c b/crypto/fipsmodule/ec/simple.c index 854da1e..5c63711 100644 --- a/crypto/fipsmodule/ec/simple.c +++ b/crypto/fipsmodule/ec/simple.c
@@ -570,55 +570,3 @@ // The points are equal. return 0; } - -int ec_GFp_simple_mont_inv_mod_ord_vartime(const EC_GROUP *group, - EC_SCALAR *out, - const EC_SCALAR *in) { - // This implementation (in fact) runs in constant time, - // even though for this interface it is not mandatory. - - // out = in^-1 in the Montgomery domain. This is - // |ec_scalar_to_montgomery| followed by |ec_scalar_inv_montgomery|, but - // |ec_scalar_inv_montgomery| followed by |ec_scalar_from_montgomery| is - // equivalent and slightly more efficient. - ec_scalar_inv_montgomery(group, out, in); - ec_scalar_from_montgomery(group, out, out); - return 1; -} - -// Compares the x (affine) coordinate of the point p with x. -// Return 1 on success 0 otherwise -// Assumption: the caller starts the BN_CTX. -int ec_GFp_simple_cmp_x_coordinate(const EC_GROUP *group, const EC_POINT *p, - const BIGNUM *r, BN_CTX *ctx) { - int ret = 0; - BN_CTX_start(ctx); - - BIGNUM *X = BN_CTX_get(ctx); - if (X == NULL) { - OPENSSL_PUT_ERROR(ECDSA, ERR_R_BN_LIB); - goto out; - } - - if (!EC_POINT_get_affine_coordinates_GFp(group, p, X, NULL, ctx)) { - OPENSSL_PUT_ERROR(ECDSA, ERR_R_EC_LIB); - goto out; - } - - if (!ec_field_element_to_scalar(group, X)) { - OPENSSL_PUT_ERROR(ECDSA, ERR_R_BN_LIB); - goto out; - } - - // The signature is correct iff |X| is equal to |r|. - if (BN_ucmp(X, r) != 0) { - OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE); - goto out; - } - - ret = 1; - -out: - BN_CTX_end(ctx); - return ret; -}
diff --git a/crypto/fipsmodule/ecdsa/ecdsa.c b/crypto/fipsmodule/ecdsa/ecdsa.c index 7ec4565..f3ce214 100644 --- a/crypto/fipsmodule/ecdsa/ecdsa.c +++ b/crypto/fipsmodule/ecdsa/ecdsa.c
@@ -98,6 +98,40 @@ order->width); } +// field_element_to_scalar reduces |r| modulo |group->order|. |r| must +// previously have been reduced modulo |group->field|. +static int field_element_to_scalar(const EC_GROUP *group, BIGNUM *r) { + // We must have p < 2×order, assuming p is not tiny (p >= 17). Thus rather we + // can reduce by performing at most one subtraction. + // + // Proof: We only work with prime order curves, so the number of points on + // the curve is the order. Thus Hasse's theorem gives: + // + // |order - (p + 1)| <= 2×sqrt(p) + // p + 1 - order <= 2×sqrt(p) + // p + 1 - 2×sqrt(p) <= order + // p + 1 - 2×(p/4) < order (p/4 > sqrt(p) for p >= 17) + // p/2 < p/2 + 1 < order + // p < 2×order + // + // Additionally, one can manually check this property for built-in curves. It + // is enforced for legacy custom curves in |EC_GROUP_set_generator|. + // + // TODO(davidben): Introduce |EC_FIELD_ELEMENT|, make this a function from + // |EC_FIELD_ELEMENT| to |EC_SCALAR|, and cut out the |BIGNUM|. Does this need + // to be constant-time for signing? |r| is the x-coordinate for kG, which is + // public unless k was rerolled because |s| was zero. + assert(!BN_is_negative(r)); + assert(BN_cmp(r, &group->field) < 0); + if (BN_cmp(r, &group->order) >= 0 && + !BN_sub(r, r, &group->order)) { + return 0; + } + assert(!BN_is_negative(r)); + assert(BN_cmp(r, &group->order) < 0); + return 1; +} + ECDSA_SIG *ECDSA_SIG_new(void) { ECDSA_SIG *sig = OPENSSL_malloc(sizeof(ECDSA_SIG)); if (sig == NULL) { @@ -159,6 +193,12 @@ } int ret = 0; EC_POINT *point = NULL; + BN_CTX_start(ctx); + BIGNUM *X = BN_CTX_get(ctx); + if (X == NULL) { + OPENSSL_PUT_ERROR(ECDSA, ERR_R_BN_LIB); + goto err; + } EC_SCALAR r, s, u1, u2, s_inv_mont, m; if (BN_is_zero(sig->r) || @@ -170,7 +210,11 @@ } // s_inv_mont = s^-1 in the Montgomery domain. This is - ec_scalar_inv_montgomery_vartime(group, &s_inv_mont, &s); + // |ec_scalar_to_montgomery| followed by |ec_scalar_inv_montgomery|, but + // |ec_scalar_inv_montgomery| followed by |ec_scalar_from_montgomery| is + // equivalent and slightly more efficient. + ec_scalar_inv_montgomery(group, &s_inv_mont, &s); + ec_scalar_from_montgomery(group, &s_inv_mont, &s_inv_mont); // u1 = m * s^-1 mod order // u2 = r * s^-1 mod order @@ -190,10 +234,24 @@ OPENSSL_PUT_ERROR(ECDSA, ERR_R_EC_LIB); goto err; } + if (!EC_POINT_get_affine_coordinates_GFp(group, point, X, NULL, ctx)) { + OPENSSL_PUT_ERROR(ECDSA, ERR_R_EC_LIB); + goto err; + } + if (!field_element_to_scalar(group, X)) { + OPENSSL_PUT_ERROR(ECDSA, ERR_R_BN_LIB); + goto err; + } + // The signature is correct iff |X| is equal to |sig->r|. + if (BN_ucmp(X, sig->r) != 0) { + OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE); + goto err; + } - ret = ec_cmp_x_coordinate(group, point, sig->r, ctx); + ret = 1; err: + BN_CTX_end(ctx); BN_CTX_free(ctx); EC_POINT_free(point); return ret; @@ -262,7 +320,7 @@ goto err; } - if (!ec_field_element_to_scalar(group, r)) { + if (!field_element_to_scalar(group, r)) { goto err; } } while (BN_is_zero(r));
diff --git a/third_party/fiat/p256.c b/third_party/fiat/p256.c index b6f3f49..2d40ea0 100644 --- a/third_party/fiat/p256.c +++ b/third_party/fiat/p256.c
@@ -1836,8 +1836,6 @@ out->bignum_to_felem = ec_GFp_mont_bignum_to_felem; out->felem_to_bignum = ec_GFp_mont_felem_to_bignum; out->scalar_inv_montgomery = ec_simple_scalar_inv_montgomery; - out->scalar_inv_montgomery_vartime = ec_GFp_simple_mont_inv_mod_ord_vartime; - out->cmp_x_coordinate = ec_GFp_simple_cmp_x_coordinate; }; #undef BORINGSSL_NISTP256_64BIT