Fix beeu_mod_inverse_vartime on aarch64 https://boringssl-review.googlesource.com/c/boringssl/+/51805 added an aarch64 version of the x86_64 beeu_mod_inverse_vartime. However, it did not quite get the shift operations right for A and B. If clz happened to return 64, the SHIFT256 macro bumps into aarch64 only looking at the bottom 6 bits of the shift operand. That means, instead of shifting the value, it jumbles the bits up a bit. Fix this by clamping the shift amount to 63 bits at a time. (There is nothing significant about how much we clamp the shift. The file was already trying to clamp the shift to 64 bits, though unsuccessfully. The x86_64 clamps it to 27 bits, because it needs to compute the shift amount very differently. Though this does mean that the Python pseudocode isn't right because it omits clmaping.) This function is only used in P-256 ECDSA verification (not signing), on aarch64 non-small builds. Because it still outputs *some* scalar, we do not believe it (or really most mod inverse math bugs) can be meaningfully used to forge ECDSA signatures: Suppose an attacker could find some (msg, r, s) tuple that triggered this bug and caused ECDSA verification to pass. ECDSA verification only uses this function to compute inv(s) and afterwards never uses inv or s again. The bug still outputs some valid scalar, so we have two cases: Case 1: buggy_inv(s) != 0 Suppose the attacker instead passed (msg, r, correct_inv(buggy_inv(s)) to a correct ECDSA verifier. The correct ECDSA verifier would then compute s_inv = buggy_inv(s) and then proceed the same as a verifier with this bug. If that passes, that means the attacker must have been able to break ECDSA without this bug anyway. Case 2: buggy_inv(s) = 0 If this bug causes ECDSA verification to proceed with s_inv = 0, this would bypass the s != 0 check at the start of verification. However, we would then compute u1 = u2 = 0 and then 0 * pub + 0 * G = infinity. Extracting the x-coordinate would then fail, so the signature would not be accepted anyway. Put another way, ECDSA signatures should simply have been (r, s^-1), not (r, s). It's less work for the verifier, while both s and s^-1 are equally easy for a signer to compute. Change-Id: I91f44b57811f3eddd3f1973a7dc078c48e0329d7 Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/93630 Presubmit-BoringSSL-Verified: boringssl-scoped@luci-project-accounts.iam.gserviceaccount.com <boringssl-scoped@luci-project-accounts.iam.gserviceaccount.com> Commit-Queue: Adam Langley <agl@google.com> Auto-Submit: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>

diff --git a/crypto/fipsmodule/ec/asm/p256_beeu-armv8-asm.pl b/crypto/fipsmodule/ec/asm/p256_beeu-armv8-asm.pl
index e215969..38e146d 100644
--- a/crypto/fipsmodule/ec/asm/p256_beeu-armv8-asm.pl
+++ b/crypto/fipsmodule/ec/asm/p256_beeu-armv8-asm.pl

@@ -193,6 +193,8 @@
 # - lsl must precede the lsr of the same register afterwards.
 # The chosen order of the instructions overall is to try and maximize
 # the pipeline usage.
+#
+# shift must be at most 63.
 sub SHIFT256 {
   my ($var0, $var1, $var2, $var3) = @_;
   return <<___;
@@ -285,6 +287,7 @@
 
     // shift := number of trailing 0s in $b0
     // (      = number of leading 0s in $t1; see the "rbit" instruction in TEST_B_ZERO)
+    orr     $t1, $t1, #1	// Clamp the shift to 63 bits (see SHIFT256)
     clz     $shift, $t1
 
     // If there is no shift, goto shift_A_Y
@@ -309,6 +312,7 @@
     // - The loop is only used to call SHIFT1(X)
     //   and $shift is decreased while executing the X loop.
     // - SHIFT256(B, $shift) is performed before right-shifting X; they are independent
+    // - "$shift" is clamped to 63 bits.
 
 .Lbeeu_shift_A_Y:
     // Same for A and Y.
@@ -316,6 +320,7 @@
     // Reverse the bit order of $a0
     // $shift := number of trailing 0s in $a0 (= number of leading 0s in $t1)
     rbit    $t1, $a0
+    orr     $t1, $t1, #1	// Clamp the shift to 63 bits (see SHIFT256)
     clz     $shift, $t1
 
     // If there is no shift, goto |B-A|, X+Y update

diff --git a/crypto/fipsmodule/ec/p256-nistz_test.cc b/crypto/fipsmodule/ec/p256-nistz_test.cc
index 49534ca..4f24e96 100644
--- a/crypto/fipsmodule/ec/p256-nistz_test.cc
+++ b/crypto/fipsmodule/ec/p256-nistz_test.cc

@@ -146,94 +146,6 @@
   CHECK_ABI(impl().select_w7, &val, table, 42);
 }
 
-TEST(P256_NistzTest, BEEU) {
-#if defined(OPENSSL_X86_64)
-  if (!CRYPTO_is_AVX_capable()) {
-    // No AVX support; cannot run the BEEU code.
-    return;
-  }
-#endif
-
-  const EC_GROUP *group = EC_group_p256();
-  BN_ULONG order_words[P256_LIMBS];
-  ASSERT_TRUE(
-      bn_copy_words(order_words, P256_LIMBS, EC_GROUP_get0_order(group)));
-
-  BN_ULONG in[P256_LIMBS], out[P256_LIMBS];
-  EC_SCALAR in_scalar, out_scalar, result;
-  OPENSSL_memset(in, 0, sizeof(in));
-
-  // Trying to find the inverse of zero should fail.
-  ASSERT_FALSE(beeu_mod_inverse_vartime(out, in, order_words));
-  // This is not a constant-time function, so instrument both zero and a few
-  // inputs below.
-  ASSERT_FALSE(CHECK_ABI(beeu_mod_inverse_vartime, out, in, order_words));
-
-  // kOneMont is 1, in Montgomery form.
-  static const BN_ULONG kOneMont[P256_LIMBS] = {
-      TOBN(0xc46353d, 0x039cdaaf),
-      TOBN(0x43190552, 0x58e8617b),
-      0,
-      0xffffffff,
-  };
-
-  for (BN_ULONG i = 1; i < 2000; i++) {
-    SCOPED_TRACE(i);
-
-    in[0] = i;
-    if (i >= 1000) {
-      in[1] = i << 8;
-      in[2] = i << 32;
-      in[3] = i << 48;
-    } else {
-      in[1] = in[2] = in[3] = 0;
-    }
-
-    EXPECT_TRUE(bn_less_than_words(in, order_words, P256_LIMBS));
-    ASSERT_TRUE(beeu_mod_inverse_vartime(out, in, order_words));
-    EXPECT_TRUE(bn_less_than_words(out, order_words, P256_LIMBS));
-
-    // Calculate out*in and confirm that it equals one, modulo the order.
-    OPENSSL_memcpy(in_scalar.words, in, sizeof(in));
-    OPENSSL_memcpy(out_scalar.words, out, sizeof(out));
-    ec_scalar_to_montgomery(group, &in_scalar, &in_scalar);
-    ec_scalar_to_montgomery(group, &out_scalar, &out_scalar);
-    ec_scalar_mul_montgomery(group, &result, &in_scalar, &out_scalar);
-
-    EXPECT_EQ(0, OPENSSL_memcmp(kOneMont, &result, sizeof(kOneMont)));
-
-    // Invert the result and expect to get back to the original value.
-    ASSERT_TRUE(beeu_mod_inverse_vartime(out, out, order_words));
-    EXPECT_EQ(0, OPENSSL_memcmp(in, out, sizeof(in)));
-
-    if (i < 5) {
-      EXPECT_TRUE(CHECK_ABI(beeu_mod_inverse_vartime, out, in, order_words));
-    }
-  }
-}
-
-static bool GetFieldElement(FileTest *t, BN_ULONG out[P256_LIMBS],
-                            const char *name) {
-  std::vector<uint8_t> bytes;
-  if (!t->GetBytes(&bytes, name)) {
-    return false;
-  }
-
-  if (bytes.size() != BN_BYTES * P256_LIMBS) {
-    ADD_FAILURE() << "Invalid length: " << name;
-    return false;
-  }
-
-  // |byte| contains bytes in big-endian while |out| should contain |BN_ULONG|s
-  // in little-endian.
-  OPENSSL_memset(out, 0, P256_LIMBS * sizeof(BN_ULONG));
-  for (size_t i = 0; i < bytes.size(); i++) {
-    out[P256_LIMBS - 1 - (i / BN_BYTES)] <<= 8;
-    out[P256_LIMBS - 1 - (i / BN_BYTES)] |= bytes[i];
-  }
-
-  return true;
-}
 
 static std::string FieldElementToString(const BN_ULONG a[P256_LIMBS]) {
   std::string ret;
@@ -262,6 +174,107 @@
 #define EXPECT_FIELD_ELEMENTS_EQUAL(a, b) \
   EXPECT_PRED_FORMAT2(ExpectFieldElementsEqual, a, b)
 
+// P-256 scalars and field elements are the same size.
+#define EXPECT_SCALARS_EQUAL(a, b) EXPECT_FIELD_ELEMENTS_EQUAL(a, b)
+
+TEST(P256_NistzTest, BEEU) {
+#if defined(OPENSSL_X86_64)
+  if (!CRYPTO_is_AVX_capable()) {
+    // No AVX support; cannot run the BEEU code.
+    return;
+  }
+#endif
+
+  const EC_GROUP *group = EC_group_p256();
+  BN_ULONG order_words[P256_LIMBS];
+  ASSERT_TRUE(
+      bn_copy_words(order_words, P256_LIMBS, EC_GROUP_get0_order(group)));
+
+  BN_ULONG in[P256_LIMBS], out[P256_LIMBS];
+  EC_SCALAR in_scalar, out_scalar, result;
+  OPENSSL_memset(in, 0, sizeof(in));
+
+  // Trying to find the inverse of zero should fail.
+  ASSERT_FALSE(beeu_mod_inverse_vartime(out, in, order_words));
+  // This is not a constant-time function, so instrument both zero and a few
+  // inputs below.
+  ASSERT_FALSE(CHECK_ABI(beeu_mod_inverse_vartime, out, in, order_words));
+
+  BN_ULONG kOne[P256_LIMBS] = {};
+  kOne[0] = 1;
+
+  auto beeu_test = [&] {
+    EXPECT_TRUE(bn_less_than_words(in, order_words, P256_LIMBS));
+    ASSERT_TRUE(beeu_mod_inverse_vartime(out, in, order_words));
+    EXPECT_TRUE(bn_less_than_words(out, order_words, P256_LIMBS));
+
+    // Calculate out*in and confirm that it equals one, modulo the order. We
+    // only have Montgomery multiplication, but if we convert one input, and not
+    // the other, this gives the result outside the Montgomery domain.
+    OPENSSL_memcpy(in_scalar.words, in, sizeof(in));
+    OPENSSL_memcpy(out_scalar.words, out, sizeof(out));
+    ec_scalar_to_montgomery(group, &in_scalar, &in_scalar);
+    ec_scalar_mul_montgomery(group, &result, &in_scalar, &out_scalar);
+    EXPECT_SCALARS_EQUAL(kOne, result.words);
+
+    // Invert the result and expect to get back to the original value.
+    ASSERT_TRUE(beeu_mod_inverse_vartime(out, out, order_words));
+    EXPECT_SCALARS_EQUAL(in, out);
+  };
+
+  for (BN_ULONG i = 1; i < 2000; i++) {
+    SCOPED_TRACE(i);
+    in[0] = i;
+    if (i >= 1000) {
+      in[1] = i << 8;
+      in[2] = i << 32;
+      in[3] = i << 48;
+    } else {
+      in[1] = in[2] = in[3] = 0;
+    }
+
+    beeu_test();
+    if (i < 5) {
+      EXPECT_TRUE(CHECK_ABI(beeu_mod_inverse_vartime, out, in, order_words));
+    }
+  }
+
+  for (int i = 0; i < 255; i++) {
+    SCOPED_TRACE(i);
+    // Test |in| = 2^i.
+    OPENSSL_memset(in, 0, sizeof(in));
+    in[i / BN_BITS2] = BN_ULONG{1} << (i % BN_BITS2);
+    beeu_test();
+
+    // Test |in| = N - 2^i.
+    bn_sub_words(in, order_words, in, P256_LIMBS);
+    beeu_test();
+  }
+}
+
+static bool GetFieldElement(FileTest *t, BN_ULONG out[P256_LIMBS],
+                            const char *name) {
+  std::vector<uint8_t> bytes;
+  if (!t->GetBytes(&bytes, name)) {
+    return false;
+  }
+
+  if (bytes.size() != BN_BYTES * P256_LIMBS) {
+    ADD_FAILURE() << "Invalid length: " << name;
+    return false;
+  }
+
+  // |byte| contains bytes in big-endian while |out| should contain |BN_ULONG|s
+  // in little-endian.
+  OPENSSL_memset(out, 0, P256_LIMBS * sizeof(BN_ULONG));
+  for (size_t i = 0; i < bytes.size(); i++) {
+    out[P256_LIMBS - 1 - (i / BN_BYTES)] <<= 8;
+    out[P256_LIMBS - 1 - (i / BN_BYTES)] |= bytes[i];
+  }
+
+  return true;
+}
+
 static bool PointToAffine(P256_POINT_AFFINE *out, const P256_POINT *in) {
   static const uint8_t kP[] = {
       0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,

diff --git a/gen/bcm/p256_beeu-armv8-asm-apple.S b/gen/bcm/p256_beeu-armv8-asm-apple.S
index 1689310..af2f0ef 100644
--- a/gen/bcm/p256_beeu-armv8-asm-apple.S
+++ b/gen/bcm/p256_beeu-armv8-asm-apple.S

@@ -81,6 +81,7 @@
 
     // shift := number of trailing 0s in x25
     // (      = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO)
+	orr	x15, x15, #1	// Clamp the shift to 63 bits (see SHIFT256)
 	clz	x13, x15
 
     // If there is no shift, goto shift_A_Y
@@ -138,6 +139,7 @@
     // - The loop is only used to call SHIFT1(X)
     //   and x13 is decreased while executing the X loop.
     // - SHIFT256(B, x13) is performed before right-shifting X; they are independent
+    // - "x13" is clamped to 63 bits.
 
 Lbeeu_shift_A_Y:
     // Same for A and Y.
@@ -145,6 +147,7 @@
     // Reverse the bit order of x21
     // x13 := number of trailing 0s in x21 (= number of leading 0s in x15)
 	rbit	x15, x21
+	orr	x15, x15, #1	// Clamp the shift to 63 bits (see SHIFT256)
 	clz	x13, x15
 
     // If there is no shift, goto |B-A|, X+Y update

diff --git a/gen/bcm/p256_beeu-armv8-asm-linux.S b/gen/bcm/p256_beeu-armv8-asm-linux.S
index 7253ad4..0c1cf05 100644
--- a/gen/bcm/p256_beeu-armv8-asm-linux.S
+++ b/gen/bcm/p256_beeu-armv8-asm-linux.S

@@ -81,6 +81,7 @@
 
     // shift := number of trailing 0s in x25
     // (      = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO)
+	orr	x15, x15, #1	// Clamp the shift to 63 bits (see SHIFT256)
 	clz	x13, x15
 
     // If there is no shift, goto shift_A_Y
@@ -138,6 +139,7 @@
     // - The loop is only used to call SHIFT1(X)
     //   and x13 is decreased while executing the X loop.
     // - SHIFT256(B, x13) is performed before right-shifting X; they are independent
+    // - "x13" is clamped to 63 bits.
 
 .Lbeeu_shift_A_Y:
     // Same for A and Y.
@@ -145,6 +147,7 @@
     // Reverse the bit order of x21
     // x13 := number of trailing 0s in x21 (= number of leading 0s in x15)
 	rbit	x15, x21
+	orr	x15, x15, #1	// Clamp the shift to 63 bits (see SHIFT256)
 	clz	x13, x15
 
     // If there is no shift, goto |B-A|, X+Y update

diff --git a/gen/bcm/p256_beeu-armv8-asm-win.S b/gen/bcm/p256_beeu-armv8-asm-win.S
index 534a81e..70440a1 100644
--- a/gen/bcm/p256_beeu-armv8-asm-win.S
+++ b/gen/bcm/p256_beeu-armv8-asm-win.S

@@ -81,6 +81,7 @@
 
     // shift := number of trailing 0s in x25
     // (      = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO)
+	orr	x15, x15, #1	// Clamp the shift to 63 bits (see SHIFT256)
 	clz	x13, x15
 
     // If there is no shift, goto shift_A_Y
@@ -138,6 +139,7 @@
     // - The loop is only used to call SHIFT1(X)
     //   and x13 is decreased while executing the X loop.
     // - SHIFT256(B, x13) is performed before right-shifting X; they are independent
+    // - "x13" is clamped to 63 bits.
 
 Lbeeu_shift_A_Y:
     // Same for A and Y.
@@ -145,6 +147,7 @@
     // Reverse the bit order of x21
     // x13 := number of trailing 0s in x21 (= number of leading 0s in x15)
 	rbit	x15, x21
+	orr	x15, x15, #1	// Clamp the shift to 63 bits (see SHIFT256)
 	clz	x13, x15
 
     // If there is no shift, goto |B-A|, X+Y update