ec/ecp_nistz256: harmonize is_infinity with ec_GFp_simple_is_at_infinity.

RT#4625

(Imported from upstream's e3057a57caf4274ea1fb074518e4714059dfcabf.)

Add a test in ec_test to cover the ecp_nistz256_points_mul change. Also
revise the low-level infinity tests to cover the changes in
ecp_nistz256_point_add. Upstream's 'infty' logic was also cleaned up to
be simpler and take advantage of the only cases where |p| is infinity.

Change-Id: Ie22de834bf79ecb6191e824ad9fc1bd6f9681b8b
Reviewed-on: https://boringssl-review.googlesource.com/12225
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Adam Langley <agl@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
diff --git a/crypto/ec/asm/p256-x86_64-asm.pl b/crypto/ec/asm/p256-x86_64-asm.pl
index 39f9820..a0b4b18 100755
--- a/crypto/ec/asm/p256-x86_64-asm.pl
+++ b/crypto/ec/asm/p256-x86_64-asm.pl
@@ -1974,16 +1974,14 @@
 	mov	$b_org, $a_ptr			# reassign
 	movdqa	%xmm0, $in1_x(%rsp)
 	movdqa	%xmm1, $in1_x+0x10(%rsp)
-	por	%xmm0, %xmm1
 	movdqa	%xmm2, $in1_y(%rsp)
 	movdqa	%xmm3, $in1_y+0x10(%rsp)
-	por	%xmm2, %xmm3
 	movdqa	%xmm4, $in1_z(%rsp)
 	movdqa	%xmm5, $in1_z+0x10(%rsp)
-	por	%xmm1, %xmm3
+	por	%xmm4, %xmm5
 
 	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$b_ptr
-	 pshufd	\$0xb1, %xmm3, %xmm5
+	 pshufd	\$0xb1, %xmm5, %xmm3
 	movdqu	0x10($a_ptr), %xmm1
 	movdqu	0x20($a_ptr), %xmm2
 	 por	%xmm3, %xmm5
@@ -1995,14 +1993,14 @@
 	movdqa	%xmm0, $in2_x(%rsp)
 	 pshufd	\$0x1e, %xmm5, %xmm4
 	movdqa	%xmm1, $in2_x+0x10(%rsp)
-	por	%xmm0, %xmm1
-	 movq	$r_ptr, %xmm0			# save $r_ptr
+	movdqu	0x40($a_ptr),%xmm0		# in2_z again
+	movdqu	0x50($a_ptr),%xmm1
 	movdqa	%xmm2, $in2_y(%rsp)
 	movdqa	%xmm3, $in2_y+0x10(%rsp)
-	por	%xmm2, %xmm3
 	 por	%xmm4, %xmm5
 	 pxor	%xmm4, %xmm4
-	por	%xmm1, %xmm3
+	por	%xmm0, %xmm1
+	 movq	$r_ptr, %xmm0			# save $r_ptr
 
 	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
 	 mov	$src0, $in2_z+8*0(%rsp)		# make in2_z copy
@@ -2013,8 +2011,8 @@
 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z2sqr, in2_z);
 
 	pcmpeqd	%xmm4, %xmm5
-	pshufd	\$0xb1, %xmm3, %xmm4
-	por	%xmm3, %xmm4
+	pshufd	\$0xb1, %xmm1, %xmm4
+	por	%xmm1, %xmm4
 	pshufd	\$0, %xmm5, %xmm5		# in1infty
 	pshufd	\$0x1e, %xmm4, %xmm3
 	por	%xmm3, %xmm4
@@ -2346,16 +2344,14 @@
 	 mov	0x40+8*3($a_ptr), $acc0
 	movdqa	%xmm0, $in1_x(%rsp)
 	movdqa	%xmm1, $in1_x+0x10(%rsp)
-	por	%xmm0, %xmm1
 	movdqa	%xmm2, $in1_y(%rsp)
 	movdqa	%xmm3, $in1_y+0x10(%rsp)
-	por	%xmm2, %xmm3
 	movdqa	%xmm4, $in1_z(%rsp)
 	movdqa	%xmm5, $in1_z+0x10(%rsp)
-	por	%xmm1, %xmm3
+	por	%xmm4, %xmm5
 
 	movdqu	0x00($b_ptr), %xmm0	# copy	*(P256_POINT_AFFINE *)$b_ptr
-	 pshufd	\$0xb1, %xmm3, %xmm5
+	 pshufd	\$0xb1, %xmm5, %xmm3
 	movdqu	0x10($b_ptr), %xmm1
 	movdqu	0x20($b_ptr), %xmm2
 	 por	%xmm3, %xmm5
diff --git a/crypto/ec/ec_test.cc b/crypto/ec/ec_test.cc
index fad57ad..31619b1 100644
--- a/crypto/ec/ec_test.cc
+++ b/crypto/ec/ec_test.cc
@@ -457,6 +457,51 @@
   return true;
 }
 
+static bool TestMulZero(int nid) {
+  bssl::UniquePtr<EC_GROUP> group(EC_GROUP_new_by_curve_name(nid));
+  if (!group) {
+    return false;
+  }
+
+  bssl::UniquePtr<EC_POINT> point(EC_POINT_new(group.get()));
+  bssl::UniquePtr<BIGNUM> zero(BN_new());
+  if (!point || !zero) {
+    return false;
+  }
+
+  BN_zero(zero.get());
+  if (!EC_POINT_mul(group.get(), point.get(), zero.get(), nullptr, nullptr,
+                    nullptr)) {
+    return false;
+  }
+
+  if (!EC_POINT_is_at_infinity(group.get(), point.get())) {
+    fprintf(stderr, "g * 0 did not return point at infinity.\n");
+    return false;
+  }
+
+  // Test that zero times an arbitrary point is also infinity. The generator is
+  // used as the arbitrary point.
+  bssl::UniquePtr<EC_POINT> generator(EC_POINT_new(group.get()));
+  bssl::UniquePtr<BIGNUM> one(BN_new());
+  if (!generator ||
+      !one ||
+      !BN_one(one.get()) ||
+      !EC_POINT_mul(group.get(), generator.get(), one.get(), nullptr, nullptr,
+                    nullptr) ||
+      !EC_POINT_mul(group.get(), point.get(), nullptr, generator.get(),
+                    zero.get(), nullptr)) {
+    return false;
+  }
+
+  if (!EC_POINT_is_at_infinity(group.get(), point.get())) {
+    fprintf(stderr, "p * 0 did not return point at infinity.\n");
+    return false;
+  }
+
+  return true;
+}
+
 static bool ForEachCurve(bool (*test_func)(int nid)) {
   const size_t num_curves = EC_get_builtin_curves(nullptr, 0);
   std::vector<EC_builtin_curve> curves(num_curves);
@@ -480,6 +525,7 @@
       !TestSpecifiedCurve() ||
       !ForEachCurve(TestSetAffine) ||
       !ForEachCurve(TestAddingEqualPoints) ||
+      !ForEachCurve(TestMulZero) ||
       !TestArbitraryCurve()) {
     fprintf(stderr, "failed\n");
     return 1;
diff --git a/crypto/ec/p256-x86_64.c b/crypto/ec/p256-x86_64.c
index f0fb794..0a3be92 100644
--- a/crypto/ec/p256-x86_64.c
+++ b/crypto/ec/p256-x86_64.c
@@ -73,6 +73,11 @@
   return (d << 1) + (s & 1);
 }
 
+/* copy_conditional copies |src| to |dst| if |move| is one and leaves it as-is
+ * if |move| is zero.
+ *
+ * WARNING: this breaks the usual convention of constant-time functions
+ * returning masks. */
 static void copy_conditional(BN_ULONG dst[P256_LIMBS],
                              const BN_ULONG src[P256_LIMBS], BN_ULONG move) {
   BN_ULONG mask1 = ((BN_ULONG)0) - move;
@@ -90,6 +95,29 @@
   }
 }
 
+/* is_not_zero returns one iff in != 0 and zero otherwise.
+ *
+ * WARNING: this breaks the usual convention of constant-time functions
+ * returning masks.
+ *
+ * (define-fun is_not_zero ((in (_ BitVec 64))) (_ BitVec 64)
+ *   (bvlshr (bvor in (bvsub #x0000000000000000 in)) #x000000000000003f)
+ * )
+ *
+ * (declare-fun x () (_ BitVec 64))
+ *
+ * (assert (and (= x #x0000000000000000) (= (is_not_zero x) #x0000000000000001)))
+ * (check-sat)
+ *
+ * (assert (and (not (= x #x0000000000000000)) (= (is_not_zero x) #x0000000000000000)))
+ * (check-sat)
+ * */
+static BN_ULONG is_not_zero(BN_ULONG in) {
+  in |= (0 - in);
+  in >>= BN_BITS2 - 1;
+  return in;
+}
+
 /* ecp_nistz256_mod_inverse_mont sets |r| to (|in| * 2^-256)^-1 * 2^256 mod p.
  * That is, |r| is the modular inverse of |in| for input and output in the
  * Montgomery domain. */
@@ -410,7 +438,11 @@
     ecp_nistz256_neg(p.p.Z, p.p.Y);
     copy_conditional(p.p.Y, p.p.Z, wvalue & 1);
 
-    memcpy(p.p.Z, ONE, sizeof(ONE));
+    /* Convert |p| from affine to Jacobian coordinates. We set Z to zero if |p|
+     * is infinity and |ONE| otherwise. |p| was computed from the table, so it
+     * is infinity iff |wvalue >> 1| is zero.  */
+    memset(p.p.Z, 0, sizeof(p.p.Z));
+    copy_conditional(p.p.Z, ONE, is_not_zero(wvalue >> 1));
 
     for (i = 1; i < 37; i++) {
       unsigned off = (index - 1) / 8;
diff --git a/crypto/ec/p256-x86_64_tests.txt b/crypto/ec/p256-x86_64_tests.txt
index 5d0f06b..a680850 100644
--- a/crypto/ec/p256-x86_64_tests.txt
+++ b/crypto/ec/p256-x86_64_tests.txt
@@ -1200,6 +1200,17 @@
 Result.X = 0000000000000000000000000000000000000000000000000000000000000000
 Result.Y = 0000000000000000000000000000000000000000000000000000000000000000
 
+# ∞ + ∞ = ∞, with an alternate representation of ∞.
+Test = PointAdd
+A.X = 2b11cb945c8cf152ffa4c9c2b1c965b019b35d0b7626919ef0ae6cb9d232f8af
+A.Y = 6d333da42e30f7011245b6281015ded14e0f100968e758a1b6c3c083afc14ea0
+A.Z = 0000000000000000000000000000000000000000000000000000000000000000
+B.X = 2b11cb945c8cf152ffa4c9c2b1c965b019b35d0b7626919ef0ae6cb9d232f8af
+B.Y = 6d333da42e30f7011245b6281015ded14e0f100968e758a1b6c3c083afc14ea0
+B.Z = 0000000000000000000000000000000000000000000000000000000000000000
+Result.X = 0000000000000000000000000000000000000000000000000000000000000000
+Result.Y = 0000000000000000000000000000000000000000000000000000000000000000
+
 # g + ∞ = g.
 Test = PointAdd
 A.X = 18905f76a53755c679fb732b7762251075ba95fc5fedb60179e730d418a9143c
@@ -1211,6 +1222,17 @@
 Result.X = 18905f76a53755c679fb732b7762251075ba95fc5fedb60179e730d418a9143c
 Result.Y = 8571ff1825885d85d2e88688dd21f3258b4ab8e4ba19e45cddf25357ce95560a
 
+# g + ∞ = g, with an alternate representation of ∞.
+Test = PointAdd
+A.X = 18905f76a53755c679fb732b7762251075ba95fc5fedb60179e730d418a9143c
+A.Y = 8571ff1825885d85d2e88688dd21f3258b4ab8e4ba19e45cddf25357ce95560a
+A.Z = 00000000fffffffeffffffffffffffffffffffff000000000000000000000001
+B.X = 2b11cb945c8cf152ffa4c9c2b1c965b019b35d0b7626919ef0ae6cb9d232f8af
+B.Y = 6d333da42e30f7011245b6281015ded14e0f100968e758a1b6c3c083afc14ea0
+B.Z = 0000000000000000000000000000000000000000000000000000000000000000
+Result.X = 18905f76a53755c679fb732b7762251075ba95fc5fedb60179e730d418a9143c
+Result.Y = 8571ff1825885d85d2e88688dd21f3258b4ab8e4ba19e45cddf25357ce95560a
+
 # g + -g = ∞.
 Test = PointAdd
 A.X = 18905f76a53755c679fb732b7762251075ba95fc5fedb60179e730d418a9143c