Rework ML-DSA modular operations

Most of the uses of constant_time_lt were unnecessary and can be
simpler. constant_time_lt needs to perform extra operations because the
inputs may use the full bit width of the input, but these values are
known to be smaller. We largely only need to select by the MSB of some
values.

Add a constant_time_select_32 so we don't have to implicitly cast
to/from int.

Remove a comment about unary minus and MSVC. We do that throughout the
codebase already.

Finally, matching the ML-KEM implementation, remove a
vectorization-impeding value barrier. This is... disappointing, but
seems to be a significant performance difference. Like in ML-KEM, this
was broadly okay except where we sample some value, where Clang was
tempted too far into misbehaving.

Apple M1 Pro, Clang:
Before:
Did 38824 MLDSA key generation operations in 4021430us (9654.3 ops/sec)
Did 7950 MLDSA sign (randomized) operations in 4072620us (1952.1 ops/sec)
Did 1163000 MLDSA parse (valid) public key operations in 4001540us (290638.1 ops/sec)
Did 40320 MLDSA verify (valid signature) operations in 4024830us (10017.8 ops/sec)
Did 40180 MLDSA verify (invalid signature) operations in 4009297us (10021.7 ops/sec)
After:
Did 48655 MLDSA key generation operations in 4020313us (12102.3 ops/sec) [+25.4%]
Did 13361 MLDSA sign (randomized) operations in 4078864us (3275.7 ops/sec) [+67.8%]
Did 1158000 MLDSA parse (valid) public key operations in 4000017us (289498.8 ops/sec) [-0.4%]
Did 56000 MLDSA verify (valid signature) operations in 4051698us (13821.4 ops/sec) [+38.0%]
Did 56000 MLDSA verify (invalid signature) operations in 4062468us (13784.7 ops/sec) [+37.5%]

Intel(R) Xeon(R) Gold 6154 CPU @ 3.00GHz, GCC:
Before:
Did 17346 MLDSA key generation operations in 4019390us (4315.6 ops/sec)
Did 3444 MLDSA sign (randomized) operations in 4066107us (847.0 ops/sec)
Did 494000 MLDSA parse (valid) public key operations in 4004318us (123366.8 ops/sec)
Did 16842 MLDSA verify (valid signature) operations in 4093079us (4114.8 ops/sec)
Did 17220 MLDSA verify (invalid signature) operations in 4089998us (4210.3 ops/sec)
After:
Did 23058 MLDSA key generation operations in 4030723us (5720.6 ops/sec) [+32.6%]
Did 6534 MLDSA sign (randomized) operations in 4061126us (1608.9 ops/sec) [+90.0%]
Did 494000 MLDSA parse (valid) public key operations in 4002108us (123434.9 ops/sec) [+0.1%]
Did 26180 MLDSA verify (valid signature) operations in 4045953us (6470.7 ops/sec) [+57.3%]
Did 25800 MLDSA verify (invalid signature) operations in 4009973us (6434.0 ops/sec) [+52.8%]

Intel(R) Xeon(R) Gold 6154 CPU @ 3.00GHz, Clang:
Before:
Did 17499 MLDSA key generation operations in 4059819us (4310.3 ops/sec)
Did 3520 MLDSA sign (randomized) operations in 4070484us (864.8 ops/sec)
Did 494000 MLDSA parse (valid) public key operations in 4003764us (123383.9 ops/sec)
Did 16926 MLDSA verify (valid signature) operations in 4029917us (4200.1 ops/sec)
Did 17220 MLDSA verify (invalid signature) operations in 4099146us (4200.9 ops/sec)
After:
Did 23104 MLDSA key generation operations in 4036297us (5724.1 ops/sec) [+32.8%]
Did 6336 MLDSA sign (randomized) operations in 4006447us (1581.5 ops/sec) [+82.9%]
Did 494000 MLDSA parse (valid) public key operations in 4005244us (123338.3 ops/sec) [-0.0%]
Did 26460 MLDSA verify (valid signature) operations in 4081059us (6483.6 ops/sec) [+54.4%]
Did 26120 MLDSA verify (invalid signature) operations in 4021846us (6494.5 ops/sec) [+54.6%]

Change-Id: I9f010ca1dde37a306e4a207caa12ec4feb920716
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/82527
Auto-Submit: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/mldsa/mldsa.cc.inc b/crypto/fipsmodule/mldsa/mldsa.cc.inc
index 630d77e..f63c0b2 100644
--- a/crypto/fipsmodule/mldsa/mldsa.cc.inc
+++ b/crypto/fipsmodule/mldsa/mldsa.cc.inc
@@ -281,36 +281,56 @@
 uint32_t reduce_once(uint32_t x) {
   declassify_assert(x < 2 * kPrime);
   // return x < kPrime ? x : x - kPrime;
-  return constant_time_select_int(constant_time_lt_w(x, kPrime), x, x - kPrime);
+  const uint32_t subtracted = x - kPrime;
+  uint32_t mask = 0u - (subtracted >> 31);
+  // Although this is a constant-time select, we omit a value barrier here.
+  // Value barriers impede auto-vectorization (likely because it forces the
+  // value to transit through a general-purpose register). This is a difference
+  // of 1.4x to 1.5x in signing performance.
+  //
+  // We usually add value barriers to selects because Clang turns consecutive
+  // selects with the same condition into a branch instead of CMOV/CSEL. This
+  // condition does not occur in ML-DSA, so omitting it seems to be generally
+  // safe. However, see |coefficient_from_nibble|.
+  return (mask & x) | (~mask & subtracted);
 }
 
-// Returns the absolute value in constant time.
+// Returns the absolute value in constant time, interpreting the high bit as a
+// sign bit.
 uint32_t abs_signed(uint32_t x) {
-  // return is_positive(x) ? x : -x;
-  // Note: MSVC doesn't like applying the unary minus operator to unsigned types
-  // (warning C4146), so we write the negation as a bitwise not plus one
-  // (assuming two's complement representation).
-  return constant_time_select_int(constant_time_lt_w(x, 0x80000000), x, 0u - x);
+  // return is_negative(x) ? -x : x;
+  uint32_t mask = 0u - (x >> 31);
+  return constant_time_select_32(mask, 0u - x, x);
 }
 
 // Returns the absolute value modulo kPrime.
 uint32_t abs_mod_prime(uint32_t x) {
   declassify_assert(x < kPrime);
-  // return x > kHalfPrime ? kPrime - x : x;
-  return constant_time_select_int(constant_time_lt_w(kHalfPrime, x), kPrime - x,
-                                  x);
+  // return x <= kHalfPrime ? x : kPrime - x;
+  uint32_t mask = x - kHalfPrime - 1;
+  mask = 0u - (mask >> 31);
+  return constant_time_select_32(mask, x, kPrime - x);
 }
 
-// Returns the maximum of two values in constant time.
-uint32_t maximum(uint32_t x, uint32_t y) {
+// Returns the maximum of two values in constant time. Each value must be less
+// than kPrime.
+uint32_t maximum_reduced(uint32_t x, uint32_t y) {
+  declassify_assert(x < kPrime);
+  declassify_assert(y < kPrime);
   // return x < y ? y : x;
-  return constant_time_select_int(constant_time_lt_w(x, y), y, x);
+  uint32_t mask = x - y;
+  mask = 0u - (mask >> 31);
+  return constant_time_select_32(mask, y, x);
 }
 
 uint32_t mod_sub(uint32_t a, uint32_t b) {
   declassify_assert(a < kPrime);
   declassify_assert(b < kPrime);
-  return reduce_once(kPrime + a - b);
+  uint32_t r = a - b;
+  // return r < 0 ? r + kPrime : r;
+  uint32_t mask = 0u - (r >> 31);
+  // See |reduce_once| for which this does not have a value barrier.
+  return (mask & (r + kPrime)) | (~mask & r);
 }
 
 void scalar_add(scalar *out, const scalar *lhs, const scalar *rhs) {
@@ -472,9 +492,9 @@
   crypto_word_t mask =
       constant_time_lt_w((uint32_t)(1 << (kDroppedBits - 1)), *r0);
   // r0 = mask ? r0_adjusted : r0
-  *r0 = constant_time_select_int(mask, r0_adjusted, *r0);
+  *r0 = constant_time_select_32(mask, r0_adjusted, *r0);
   // r1 = mask ? r1_adjusted : r1
-  *r1 = constant_time_select_int(mask, r1_adjusted, *r1);
+  *r1 = constant_time_select_32(mask, r1_adjusted, *r1);
 }
 
 // Scale back previously rounded value.
@@ -621,14 +641,14 @@
 void scalar_max(uint32_t *max, const scalar *s) {
   for (int i = 0; i < kDegree; i++) {
     uint32_t abs = abs_mod_prime(s->c[i]);
-    *max = maximum(*max, abs);
+    *max = maximum_reduced(*max, abs);
   }
 }
 
 void scalar_max_signed(uint32_t *max, const scalar *s) {
   for (int i = 0; i < kDegree; i++) {
     uint32_t abs = abs_signed(s->c[i]);
-    *max = maximum(*max, abs);
+    *max = maximum_reduced(*max, abs);
   }
 }
 
@@ -1133,7 +1153,9 @@
 template <>
 bool coefficient_from_nibble<4>(uint32_t nibble, uint32_t *result) {
   if (constant_time_declassify_int(nibble < 9)) {
-    *result = mod_sub(4, nibble);
+    // Knowing bounds on |nibble| seems to tempt some versions of Clang to emit
+    // a branch, if we don't have a barrier in |mod_sub|.
+    *result = mod_sub(4, value_barrier_u32(nibble));
     return true;
   }
   return false;
@@ -1142,7 +1164,9 @@
 template <>
 bool coefficient_from_nibble<2>(uint32_t nibble, uint32_t *result) {
   if (constant_time_declassify_int(nibble < 15)) {
-    *result = mod_sub(2, nibble % 5);
+    // Knowing bounds on |nibble| seems to tempt some versions of Clang to emit
+    // a branch, if we don't have a barrier in |mod_sub|.
+    *result = mod_sub(2, value_barrier_u32(nibble % 5));
     return true;
   }
   return false;
diff --git a/crypto/internal.h b/crypto/internal.h
index dc504b4..30ec7df 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -405,8 +405,16 @@
 // constant_time_select_int acts like |constant_time_select| but operates on
 // ints.
 static inline int constant_time_select_int(crypto_word_t mask, int a, int b) {
-  return (int)(constant_time_select_w(mask, (crypto_word_t)(a),
-                                      (crypto_word_t)(b)));
+  return static_cast<int>(constant_time_select_w(
+      mask, static_cast<crypto_word_t>(a), static_cast<crypto_word_t>(b)));
+}
+
+// constant_time_select_32 acts like |constant_time_select| but operates on
+// 32-bit values.
+static inline uint32_t constant_time_select_32(crypto_word_t mask, uint32_t a,
+                                               uint32_t b) {
+  return static_cast<uint32_t>(
+      constant_time_select_w(mask, crypto_word_t{a}, crypto_word_t{b}));
 }
 
 // constant_time_conditional_memcpy copies |n| bytes from |src| to |dst| if