Rework ML-DSA modular operations Most of the uses of constant_time_lt were unnecessary and can be simpler. constant_time_lt needs to perform extra operations because the inputs may use the full bit width of the input, but these values are known to be smaller. We largely only need to select by the MSB of some values. Add a constant_time_select_32 so we don't have to implicitly cast to/from int. Remove a comment about unary minus and MSVC. We do that throughout the codebase already. Finally, matching the ML-KEM implementation, remove a vectorization-impeding value barrier. This is... disappointing, but seems to be a significant performance difference. Like in ML-KEM, this was broadly okay except where we sample some value, where Clang was tempted too far into misbehaving. Apple M1 Pro, Clang: Before: Did 38824 MLDSA key generation operations in 4021430us (9654.3 ops/sec) Did 7950 MLDSA sign (randomized) operations in 4072620us (1952.1 ops/sec) Did 1163000 MLDSA parse (valid) public key operations in 4001540us (290638.1 ops/sec) Did 40320 MLDSA verify (valid signature) operations in 4024830us (10017.8 ops/sec) Did 40180 MLDSA verify (invalid signature) operations in 4009297us (10021.7 ops/sec) After: Did 48655 MLDSA key generation operations in 4020313us (12102.3 ops/sec) [+25.4%] Did 13361 MLDSA sign (randomized) operations in 4078864us (3275.7 ops/sec) [+67.8%] Did 1158000 MLDSA parse (valid) public key operations in 4000017us (289498.8 ops/sec) [-0.4%] Did 56000 MLDSA verify (valid signature) operations in 4051698us (13821.4 ops/sec) [+38.0%] Did 56000 MLDSA verify (invalid signature) operations in 4062468us (13784.7 ops/sec) [+37.5%] Intel(R) Xeon(R) Gold 6154 CPU @ 3.00GHz, GCC: Before: Did 17346 MLDSA key generation operations in 4019390us (4315.6 ops/sec) Did 3444 MLDSA sign (randomized) operations in 4066107us (847.0 ops/sec) Did 494000 MLDSA parse (valid) public key operations in 4004318us (123366.8 ops/sec) Did 16842 MLDSA verify (valid signature) operations in 4093079us (4114.8 ops/sec) Did 17220 MLDSA verify (invalid signature) operations in 4089998us (4210.3 ops/sec) After: Did 23058 MLDSA key generation operations in 4030723us (5720.6 ops/sec) [+32.6%] Did 6534 MLDSA sign (randomized) operations in 4061126us (1608.9 ops/sec) [+90.0%] Did 494000 MLDSA parse (valid) public key operations in 4002108us (123434.9 ops/sec) [+0.1%] Did 26180 MLDSA verify (valid signature) operations in 4045953us (6470.7 ops/sec) [+57.3%] Did 25800 MLDSA verify (invalid signature) operations in 4009973us (6434.0 ops/sec) [+52.8%] Intel(R) Xeon(R) Gold 6154 CPU @ 3.00GHz, Clang: Before: Did 17499 MLDSA key generation operations in 4059819us (4310.3 ops/sec) Did 3520 MLDSA sign (randomized) operations in 4070484us (864.8 ops/sec) Did 494000 MLDSA parse (valid) public key operations in 4003764us (123383.9 ops/sec) Did 16926 MLDSA verify (valid signature) operations in 4029917us (4200.1 ops/sec) Did 17220 MLDSA verify (invalid signature) operations in 4099146us (4200.9 ops/sec) After: Did 23104 MLDSA key generation operations in 4036297us (5724.1 ops/sec) [+32.8%] Did 6336 MLDSA sign (randomized) operations in 4006447us (1581.5 ops/sec) [+82.9%] Did 494000 MLDSA parse (valid) public key operations in 4005244us (123338.3 ops/sec) [-0.0%] Did 26460 MLDSA verify (valid signature) operations in 4081059us (6483.6 ops/sec) [+54.4%] Did 26120 MLDSA verify (invalid signature) operations in 4021846us (6494.5 ops/sec) [+54.6%] Change-Id: I9f010ca1dde37a306e4a207caa12ec4feb920716 Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/82527 Auto-Submit: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: Adam Langley <agl@google.com>

commit: 21544c52bb7e46bfb77c6e8d8273395af2ea3318 [log] [tgz]
author: David Benjamin <davidben@google.com> Tue Sep 30 12:47:33 2025 -0400
committer: Boringssl LUCI CQ <boringssl-scoped@luci-project-accounts.iam.gserviceaccount.com> Thu Oct 02 08:56:05 2025 -0700
tree: 83e05b9d0a017bca788dd9a06fff8b2386baf011
parent: 706742e482d89214f13a642ccfcdad596a24a32f [diff]
diff --git a/crypto/fipsmodule/mldsa/mldsa.cc.inc b/crypto/fipsmodule/mldsa/mldsa.cc.inc
index 630d77e..f63c0b2 100644
--- a/crypto/fipsmodule/mldsa/mldsa.cc.inc
+++ b/crypto/fipsmodule/mldsa/mldsa.cc.inc

@@ -281,36 +281,56 @@
 uint32_t reduce_once(uint32_t x) {
   declassify_assert(x < 2 * kPrime);
   // return x < kPrime ? x : x - kPrime;
-  return constant_time_select_int(constant_time_lt_w(x, kPrime), x, x - kPrime);
+  const uint32_t subtracted = x - kPrime;
+  uint32_t mask = 0u - (subtracted >> 31);
+  // Although this is a constant-time select, we omit a value barrier here.
+  // Value barriers impede auto-vectorization (likely because it forces the
+  // value to transit through a general-purpose register). This is a difference
+  // of 1.4x to 1.5x in signing performance.
+  //
+  // We usually add value barriers to selects because Clang turns consecutive
+  // selects with the same condition into a branch instead of CMOV/CSEL. This
+  // condition does not occur in ML-DSA, so omitting it seems to be generally
+  // safe. However, see |coefficient_from_nibble|.
+  return (mask & x) | (~mask & subtracted);
 }
 
-// Returns the absolute value in constant time.
+// Returns the absolute value in constant time, interpreting the high bit as a
+// sign bit.
 uint32_t abs_signed(uint32_t x) {
-  // return is_positive(x) ? x : -x;
-  // Note: MSVC doesn't like applying the unary minus operator to unsigned types
-  // (warning C4146), so we write the negation as a bitwise not plus one
-  // (assuming two's complement representation).
-  return constant_time_select_int(constant_time_lt_w(x, 0x80000000), x, 0u - x);
+  // return is_negative(x) ? -x : x;
+  uint32_t mask = 0u - (x >> 31);
+  return constant_time_select_32(mask, 0u - x, x);
 }
 
 // Returns the absolute value modulo kPrime.
 uint32_t abs_mod_prime(uint32_t x) {
   declassify_assert(x < kPrime);
-  // return x > kHalfPrime ? kPrime - x : x;
-  return constant_time_select_int(constant_time_lt_w(kHalfPrime, x), kPrime - x,
-                                  x);
+  // return x <= kHalfPrime ? x : kPrime - x;
+  uint32_t mask = x - kHalfPrime - 1;
+  mask = 0u - (mask >> 31);
+  return constant_time_select_32(mask, x, kPrime - x);
 }
 
-// Returns the maximum of two values in constant time.
-uint32_t maximum(uint32_t x, uint32_t y) {
+// Returns the maximum of two values in constant time. Each value must be less
+// than kPrime.
+uint32_t maximum_reduced(uint32_t x, uint32_t y) {
+  declassify_assert(x < kPrime);
+  declassify_assert(y < kPrime);
   // return x < y ? y : x;
-  return constant_time_select_int(constant_time_lt_w(x, y), y, x);
+  uint32_t mask = x - y;
+  mask = 0u - (mask >> 31);
+  return constant_time_select_32(mask, y, x);
 }
 
 uint32_t mod_sub(uint32_t a, uint32_t b) {
   declassify_assert(a < kPrime);
   declassify_assert(b < kPrime);
-  return reduce_once(kPrime + a - b);
+  uint32_t r = a - b;
+  // return r < 0 ? r + kPrime : r;
+  uint32_t mask = 0u - (r >> 31);
+  // See |reduce_once| for which this does not have a value barrier.
+  return (mask & (r + kPrime)) | (~mask & r);
 }
 
 void scalar_add(scalar *out, const scalar *lhs, const scalar *rhs) {
@@ -472,9 +492,9 @@
   crypto_word_t mask =
       constant_time_lt_w((uint32_t)(1 << (kDroppedBits - 1)), *r0);
   // r0 = mask ? r0_adjusted : r0
-  *r0 = constant_time_select_int(mask, r0_adjusted, *r0);
+  *r0 = constant_time_select_32(mask, r0_adjusted, *r0);
   // r1 = mask ? r1_adjusted : r1
-  *r1 = constant_time_select_int(mask, r1_adjusted, *r1);
+  *r1 = constant_time_select_32(mask, r1_adjusted, *r1);
 }
 
 // Scale back previously rounded value.
@@ -621,14 +641,14 @@
 void scalar_max(uint32_t *max, const scalar *s) {
   for (int i = 0; i < kDegree; i++) {
     uint32_t abs = abs_mod_prime(s->c[i]);
-    *max = maximum(*max, abs);
+    *max = maximum_reduced(*max, abs);
   }
 }
 
 void scalar_max_signed(uint32_t *max, const scalar *s) {
   for (int i = 0; i < kDegree; i++) {
     uint32_t abs = abs_signed(s->c[i]);
-    *max = maximum(*max, abs);
+    *max = maximum_reduced(*max, abs);
   }
 }
 
@@ -1133,7 +1153,9 @@
 template <>
 bool coefficient_from_nibble<4>(uint32_t nibble, uint32_t *result) {
   if (constant_time_declassify_int(nibble < 9)) {
-    *result = mod_sub(4, nibble);
+    // Knowing bounds on |nibble| seems to tempt some versions of Clang to emit
+    // a branch, if we don't have a barrier in |mod_sub|.
+    *result = mod_sub(4, value_barrier_u32(nibble));
     return true;
   }
   return false;
@@ -1142,7 +1164,9 @@
 template <>
 bool coefficient_from_nibble<2>(uint32_t nibble, uint32_t *result) {
   if (constant_time_declassify_int(nibble < 15)) {
-    *result = mod_sub(2, nibble % 5);
+    // Knowing bounds on |nibble| seems to tempt some versions of Clang to emit
+    // a branch, if we don't have a barrier in |mod_sub|.
+    *result = mod_sub(2, value_barrier_u32(nibble % 5));
     return true;
   }
   return false;

diff --git a/crypto/internal.h b/crypto/internal.h
index dc504b4..30ec7df 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h

@@ -405,8 +405,16 @@
 // constant_time_select_int acts like |constant_time_select| but operates on
 // ints.
 static inline int constant_time_select_int(crypto_word_t mask, int a, int b) {
-  return (int)(constant_time_select_w(mask, (crypto_word_t)(a),
-                                      (crypto_word_t)(b)));
+  return static_cast<int>(constant_time_select_w(
+      mask, static_cast<crypto_word_t>(a), static_cast<crypto_word_t>(b)));
+}
+
+// constant_time_select_32 acts like |constant_time_select| but operates on
+// 32-bit values.
+static inline uint32_t constant_time_select_32(crypto_word_t mask, uint32_t a,
+                                               uint32_t b) {
+  return static_cast<uint32_t>(
+      constant_time_select_w(mask, crypto_word_t{a}, crypto_word_t{b}));
 }
 
 // constant_time_conditional_memcpy copies |n| bytes from |src| to |dst| if
commit	21544c52bb7e46bfb77c6e8d8273395af2ea3318	[log] [tgz]
author	David Benjamin <davidben@google.com>	Tue Sep 30 12:47:33 2025 -0400
committer	Boringssl LUCI CQ <boringssl-scoped@luci-project-accounts.iam.gserviceaccount.com>	Thu Oct 02 08:56:05 2025 -0700
tree	83e05b9d0a017bca788dd9a06fff8b2386baf011
parent	706742e482d89214f13a642ccfcdad596a24a32f [diff]