bn/asm/x86_64-mont5.pl: fix carry bug in bn_sqr8x_internal.

CVE-2017-3732

(Imported from upstream's 3f4bcf5bb664b47ed369a70b99fac4e0ad141bb3 and
3e7a496307ab1174c1f8f64eed4454c1c9cde1a8.)

Change-Id: I40255fdf4184e3b919758a72c3d3a7486d91ff65
Reviewed-on: https://boringssl-review.googlesource.com/13360
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Adam Langley <agl@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index 61dd902..61fde2d 100755
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -1852,6 +1852,7 @@
 
 .align	32
 .L8x_tail_done:
+	xor	%rax,%rax
 	add	(%rdx),%r8		# can this overflow?
 	adc	\$0,%r9
 	adc	\$0,%r10
@@ -1859,10 +1860,8 @@
 	adc	\$0,%r12
 	adc	\$0,%r13
 	adc	\$0,%r14
-	adc	\$0,%r15		# can't overflow, because we
-					# started with "overhung" part
-					# of multiplication
-	xor	%rax,%rax
+	adc	\$0,%r15
+	adc	\$0,%rax
 
 	neg	$carry
 .L8x_no_tail:
@@ -3248,6 +3247,7 @@
 
 .align	32
 .Lsqrx8x_tail_done:
+	xor	%rax,%rax
 	add	24+8(%rsp),%r8		# can this overflow?
 	adc	\$0,%r9
 	adc	\$0,%r10
@@ -3255,10 +3255,8 @@
 	adc	\$0,%r12
 	adc	\$0,%r13
 	adc	\$0,%r14
-	adc	\$0,%r15		# can't overflow, because we
-					# started with "overhung" part
-					# of multiplication
-	mov	$carry,%rax		# xor	%rax,%rax
+	adc	\$0,%r15
+	adc	\$0,%rax
 
 	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
 .Lsqrx8x_no_tail:			# %cf is 0 if jumped here
@@ -3273,7 +3271,7 @@
 	adc	8*5($tptr),%r13
 	adc	8*6($tptr),%r14
 	adc	8*7($tptr),%r15
-	adc	%rax,%rax		# top-most carry
+	adc	\$0,%rax		# top-most carry
 
 	mov	32+8(%rsp),%rbx		# n0
 	mov	8*8($tptr,%rcx),%rdx	# modulo-scheduled "%r8"
diff --git a/crypto/bn/bn_test.cc b/crypto/bn/bn_test.cc
index 8f93ad0..a152cdf 100644
--- a/crypto/bn/bn_test.cc
+++ b/crypto/bn/bn_test.cc
@@ -515,6 +515,54 @@
   return true;
 }
 
+static bool TestModSquare(FileTest *t, BN_CTX *ctx) {
+  bssl::UniquePtr<BIGNUM> a = GetBIGNUM(t, "A");
+  bssl::UniquePtr<BIGNUM> m = GetBIGNUM(t, "M");
+  bssl::UniquePtr<BIGNUM> mod_square = GetBIGNUM(t, "ModSquare");
+  if (!a || !m || !mod_square) {
+    return false;
+  }
+
+  bssl::UniquePtr<BIGNUM> a_copy(BN_new());
+  bssl::UniquePtr<BIGNUM> ret(BN_new());
+  if (!ret || !a_copy ||
+      !BN_mod_mul(ret.get(), a.get(), a.get(), m.get(), ctx) ||
+      !ExpectBIGNUMsEqual(t, "A * A (mod M)", mod_square.get(), ret.get()) ||
+      // Repeat the operation with |a_copy|.
+      !BN_copy(a_copy.get(), a.get()) ||
+      !BN_mod_mul(ret.get(), a.get(), a_copy.get(), m.get(), ctx) ||
+      !ExpectBIGNUMsEqual(t, "A * A_copy (mod M)", mod_square.get(),
+                          ret.get())) {
+    return false;
+  }
+
+  if (BN_is_odd(m.get())) {
+    // Reduce |a| and test the Montgomery version.
+    bssl::UniquePtr<BN_MONT_CTX> mont(BN_MONT_CTX_new());
+    bssl::UniquePtr<BIGNUM> a_tmp(BN_new());
+    if (!mont || !a_tmp ||
+        !BN_MONT_CTX_set(mont.get(), m.get(), ctx) ||
+        !BN_nnmod(a_tmp.get(), a.get(), m.get(), ctx) ||
+        !BN_to_montgomery(a_tmp.get(), a_tmp.get(), mont.get(), ctx) ||
+        !BN_mod_mul_montgomery(ret.get(), a_tmp.get(), a_tmp.get(), mont.get(),
+                               ctx) ||
+        !BN_from_montgomery(ret.get(), ret.get(), mont.get(), ctx) ||
+        !ExpectBIGNUMsEqual(t, "A * A (mod M) (Montgomery)",
+                            mod_square.get(), ret.get()) ||
+        // Repeat the operation with |a_copy|.
+        !BN_copy(a_copy.get(), a_tmp.get()) ||
+        !BN_mod_mul_montgomery(ret.get(), a_tmp.get(), a_copy.get(), mont.get(),
+                               ctx) ||
+        !BN_from_montgomery(ret.get(), ret.get(), mont.get(), ctx) ||
+        !ExpectBIGNUMsEqual(t, "A * A_copy (mod M) (Montgomery)",
+                            mod_square.get(), ret.get())) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
 static bool TestModExp(FileTest *t, BN_CTX *ctx) {
   bssl::UniquePtr<BIGNUM> a = GetBIGNUM(t, "A");
   bssl::UniquePtr<BIGNUM> e = GetBIGNUM(t, "E");
@@ -649,6 +697,7 @@
     {"Product", TestProduct},
     {"Quotient", TestQuotient},
     {"ModMul", TestModMul},
+    {"ModSquare", TestModSquare},
     {"ModExp", TestModExp},
     {"Exp", TestExp},
     {"ModSqrt", TestModSqrt},
diff --git a/crypto/bn/bn_tests.txt b/crypto/bn/bn_tests.txt
index 46c788f..c53eb23 100644
--- a/crypto/bn/bn_tests.txt
+++ b/crypto/bn/bn_tests.txt
@@ -9888,6 +9888,16 @@
 M = d78af684e71db0c39cff4e64fb9db567132cb9c50cc98009feb820b26f2ded9b91b9b5e2b83ae0ae4eb4e0523ca726bfbe969b89fd754f674ce99118c3f2d1c5d81fdc7c54e02b60262b241d53c040e99e45826eca37a804668e690e1afc1ca42c9a15d84d4954425f0b7642fc0bd9d7b24e2618d2dcc9b729d944badacfddaf
 
 
+# ModSquare tests.
+#
+# These test vectors satisfy A * A = ModSquare (mod M) and 0 <= ModSquare < M.
+
+# Regression test for CVE-2017-3732.
+ModSquare = fffffffdfffffd01000009000002f6fffdf403000312000402f3fff5f602fe080a0005fdfafffa00010001ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff00000002000002fefffff7fffffd07000109fdfffef3fffdfd06000405ff00fdfbfffe00010001
+A = ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff00000000000000ffffffff00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ffffffffffffff00000000
+M = ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff00000000000000ffffffff00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ffffffffffffff
+
+
 # ModExp tests.
 #
 # These test vectors satisfy A ^ E = ModExp (mod M) and 0 <= ModExp < M.