Avoid leaking intermediate states in point doubling special case.

Point addition formulas for short Weierstrass curves are often
incomplete and do not work for P + P. EC implementations usually rely on
constant-time operations never hitting this case, or at least it being
rare[0].

However, the condition checks several values. Our C functions use && and
||, and the P-256 assembly also branches. This can leak intermediate
values via a small side channel. Thanks to David Schrammel and Samuel
Weiser for reporting this.

nistz256 base point multiplication (keygen, ECDSA signing) is unaffected
due to ecp_nistz256_point_add_affine lacking a doubling case. nistp224
and nistp256 base point multiplication, on some compilers, are saved by
quirks of the "mixed" path. The generic code's base point multiplication
and all methods' arbitrary point multiplication (ECDH; ephemeral keys
makes this less interesting) are affected.

Fix the branches in the nistz256 assembly, and use bit operations in C.
Note the C versions are all different because P-224 believes true is 1,
P-256 believes true is any non-zero value, and the generic code believes
true is 0xf...f. This should be double-checked when reviewing.

Aside: The nistz256 assembly also special-cases nontrivial P + (-P) in
arbitrary point multiplication. Fortunately, the formulas in util.c hold
there and I believe one can show P + (-P) is unreachable for all curves.
Still, it would be nice to omit the branch if we can verify the assembly
works anyway.

[0] https://github.com/openssl/openssl/blob/03da376ff7504c63a1d00d57cf41bd7b7e93ff65/crypto/ec/ecp_nistp521.c#L1259

Change-Id: I8958624cd6b5272e5076c6c1605ab089e85f4cb7
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/36465
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl b/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl
index 6a021db..994cb82 100755
--- a/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl
+++ b/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl
@@ -3112,17 +3112,24 @@
 
 	or	$acc5, $acc4			# see if result is zero
 	or	$acc0, $acc4
-	or	$acc1, $acc4
+	or	$acc1, $acc4			# !is_equal(U1, U2)
 
-	.byte	0x3e				# predict taken
-	jnz	.Ladd_proceed$x			# is_equal(U1,U2)?
 	movq	%xmm2, $acc0
 	movq	%xmm3, $acc1
-	test	$acc0, $acc0
-	jnz	.Ladd_proceed$x			# (in1infty || in2infty)?
-	test	$acc1, $acc1
-	jz	.Ladd_double$x			# is_equal(S1,S2)?
+	or	$acc0, $acc4
+	.byte	0x3e				# predict taken
+	jnz	.Ladd_proceed$x			# !is_equal(U1, U2) || in1infty || in2infty
 
+	# We now know A = B or A = -B and neither is infinity. Compare the
+	# y-coordinates via S1 and S2.
+	test	$acc1, $acc1
+	jz	.Ladd_double$x			# is_equal(S1, S2)
+
+	# A = -B, so the result is infinity.
+	#
+	# TODO(davidben): Does .Ladd_proceed handle this case? It seems to, in
+	# which case we should eliminate this special-case and simplify the
+	# timing analysis.
 	movq	%xmm0, $r_ptr			# restore $r_ptr
 	pxor	%xmm0, %xmm0
 	movdqu	%xmm0, 0x00($r_ptr)
diff --git a/crypto/fipsmodule/ec/ec_montgomery.c b/crypto/fipsmodule/ec/ec_montgomery.c
index 6fb32c4..0cf1d91 100644
--- a/crypto/fipsmodule/ec/ec_montgomery.c
+++ b/crypto/fipsmodule/ec/ec_montgomery.c
@@ -282,7 +282,8 @@
   BN_ULONG yneq = ec_felem_non_zero_mask(group, &r);
 
   // This case will never occur in the constant-time |ec_GFp_mont_mul|.
-  if (!xneq && !yneq && z1nz && z2nz) {
+  BN_ULONG is_nontrivial_double = ~xneq & ~yneq & z1nz & z2nz;
+  if (is_nontrivial_double) {
     ec_GFp_mont_dbl(group, out, a);
     return;
   }
diff --git a/crypto/fipsmodule/ec/p224-64.c b/crypto/fipsmodule/ec/p224-64.c
index cc88f15..f8af39b 100644
--- a/crypto/fipsmodule/ec/p224-64.c
+++ b/crypto/fipsmodule/ec/p224-64.c
@@ -758,7 +758,9 @@
   z1_is_zero = p224_felem_is_zero(z1);
   z2_is_zero = p224_felem_is_zero(z2);
   // In affine coordinates, (X_1, Y_1) == (X_2, Y_2)
-  if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
+  p224_limb is_nontrivial_double =
+      x_equal & y_equal & (1 - z1_is_zero) & (1 - z2_is_zero);
+  if (is_nontrivial_double) {
     p224_point_double(x3, y3, z3, x1, y1, z1);
     return;
   }
diff --git a/third_party/fiat/p256.c b/third_party/fiat/p256.c
index 8426beb..23ec71f 100644
--- a/third_party/fiat/p256.c
+++ b/third_party/fiat/p256.c
@@ -321,7 +321,10 @@
 
   limb_t yneq = fe_nz(r);
 
-  if (!xneq && !yneq && z1nz && z2nz) {
+  limb_t is_nontrivial_double = constant_time_is_zero_w(xneq | yneq) &
+                                ~constant_time_is_zero_w(z1nz) &
+                                ~constant_time_is_zero_w(z2nz);
+  if (is_nontrivial_double) {
     point_double(x3, y3, z3, x1, y1, z1);
     return;
   }