Move ML-DSA and Keccak into the FIPS module.

Change-Id: I615e25bbd5056d2149cb9795bb08b2c79abbae5e
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/73327
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
diff --git a/build.json b/build.json
index 9a667f3..a12e63e 100644
--- a/build.json
+++ b/build.json
@@ -68,6 +68,8 @@
             "crypto/fipsmodule/ecdsa/ecdsa.cc.inc",
             "crypto/fipsmodule/hkdf/hkdf.cc.inc",
             "crypto/fipsmodule/hmac/hmac.cc.inc",
+            "crypto/fipsmodule/keccak/keccak.cc.inc",
+            "crypto/fipsmodule/mldsa/mldsa.cc.inc",
             "crypto/fipsmodule/modes/cbc.cc.inc",
             "crypto/fipsmodule/modes/cfb.cc.inc",
             "crypto/fipsmodule/modes/ctr.cc.inc",
@@ -260,7 +262,6 @@
             "crypto/fipsmodule/fips_shared_support.cc",
             "crypto/hpke/hpke.cc",
             "crypto/hrss/hrss.cc",
-            "crypto/keccak/keccak.cc",
             "crypto/kyber/kyber.cc",
             "crypto/lhash/lhash.cc",
             "crypto/md4/md4.cc",
@@ -510,6 +511,7 @@
             "crypto/fipsmodule/ec/p256-nistz.h",
             "crypto/fipsmodule/ec/p256_table.h",
             "crypto/fipsmodule/ecdsa/internal.h",
+            "crypto/fipsmodule/keccak/internal.h",
             "crypto/fipsmodule/modes/internal.h",
             "crypto/fipsmodule/rand/internal.h",
             "crypto/fipsmodule/rsa/internal.h",
@@ -519,11 +521,9 @@
             "crypto/hrss/internal.h",
             "crypto/bcm_support.h",
             "crypto/internal.h",
-            "crypto/keccak/internal.h",
             "crypto/kyber/internal.h",
             "crypto/lhash/internal.h",
             "crypto/md5/internal.h",
-            "crypto/mldsa/internal.h",
             "crypto/mlkem/internal.h",
             "crypto/obj/obj_dat.h",
             "crypto/pkcs7/internal.h",
@@ -828,6 +828,7 @@
             "crypto/fipsmodule/ec/p256_test.cc",
             "crypto/fipsmodule/ecdsa/ecdsa_test.cc",
             "crypto/fipsmodule/hkdf/hkdf_test.cc",
+            "crypto/fipsmodule/keccak/keccak_test.cc",
             "crypto/fipsmodule/modes/gcm_test.cc",
             "crypto/fipsmodule/rand/ctrdrbg_test.cc",
             "crypto/fipsmodule/service_indicator/service_indicator_test.cc",
@@ -836,7 +837,6 @@
             "crypto/hpke/hpke_test.cc",
             "crypto/hrss/hrss_test.cc",
             "crypto/impl_dispatch_test.cc",
-            "crypto/keccak/keccak_test.cc",
             "crypto/kyber/kyber_test.cc",
             "crypto/lhash/lhash_test.cc",
             "crypto/md5/md5_test.cc",
@@ -883,10 +883,10 @@
             "crypto/fipsmodule/ec/p256-nistz_tests.txt",
             "crypto/fipsmodule/ecdsa/ecdsa_sign_tests.txt",
             "crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt",
+            "crypto/fipsmodule/keccak/keccak_tests.txt",
             "crypto/fipsmodule/rand/ctrdrbg_vectors.txt",
             "crypto/hmac_extra/hmac_tests.txt",
             "crypto/hpke/hpke_test_vectors.txt",
-            "crypto/keccak/keccak_tests.txt",
             "crypto/kyber/kyber_tests.txt",
             "crypto/mldsa/mldsa_nist_keygen_tests.txt",
             "crypto/mldsa/mldsa_nist_siggen_tests.txt",
diff --git a/crypto/fipsmodule/bcm.cc b/crypto/fipsmodule/bcm.cc
index 9ca3020..66e1365 100644
--- a/crypto/fipsmodule/bcm.cc
+++ b/crypto/fipsmodule/bcm.cc
@@ -86,6 +86,8 @@
 #include "ecdsa/ecdsa.cc.inc"
 #include "hkdf/hkdf.cc.inc"
 #include "hmac/hmac.cc.inc"
+#include "keccak/keccak.cc.inc"
+#include "mldsa/mldsa.cc.inc"
 #include "modes/cbc.cc.inc"
 #include "modes/cfb.cc.inc"
 #include "modes/ctr.cc.inc"
diff --git a/crypto/fipsmodule/bcm_interface.h b/crypto/fipsmodule/bcm_interface.h
index c68beea..a349e97 100644
--- a/crypto/fipsmodule/bcm_interface.h
+++ b/crypto/fipsmodule/bcm_interface.h
@@ -51,6 +51,10 @@
   return status == bcm_status::approved || status == bcm_status::not_approved;
 }
 
+OPENSSL_INLINE bcm_status_t bcm_as_approved_status(int result) {
+  return result ? bcm_status::approved : bcm_status::failure;
+}
+
 
 // Random number generator.
 
@@ -235,6 +239,111 @@
                                     SHA512_CTX *sha);
 
 
+// ML-DSA
+//
+// Where not commented, these functions have the same signature as the
+// corresponding public function.
+
+// BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES is the number of bytes of uniformly
+// random entropy necessary to generate a signature in randomized mode.
+#define BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES 32
+
+// BCM_MLDSA_SEED_BYTES is the number of bytes in an ML-DSA seed value.
+#define BCM_MLDSA_SEED_BYTES 32
+
+// BCM_MLDSA65_PRIVATE_KEY_BYTES is the number of bytes in an encoded ML-DSA-65
+// private key.
+#define BCM_MLDSA65_PRIVATE_KEY_BYTES 4032
+
+// BCM_MLDSA65_PUBLIC_KEY_BYTES is the number of bytes in an encoded ML-DSA-65
+// public key.
+#define BCM_MLDSA65_PUBLIC_KEY_BYTES 1952
+
+// BCM_MLDSA65_SIGNATURE_BYTES is the number of bytes in an encoded ML-DSA-65
+// signature.
+#define BCM_MLDSA65_SIGNATURE_BYTES 3309
+
+struct BCM_mldsa65_private_key {
+  union {
+    uint8_t bytes[32 + 32 + 64 + 256 * 4 * (5 + 6 + 6)];
+    uint32_t alignment;
+  } opaque;
+};
+
+struct BCM_mldsa65_public_key {
+  union {
+    uint8_t bytes[32 + 64 + 256 * 4 * 6];
+    uint32_t alignment;
+  } opaque;
+};
+
+OPENSSL_EXPORT bcm_status BCM_mldsa65_generate_key(
+    uint8_t out_encoded_public_key[BCM_MLDSA65_PUBLIC_KEY_BYTES],
+    uint8_t out_seed[BCM_MLDSA_SEED_BYTES],
+    struct BCM_mldsa65_private_key *out_private_key);
+
+OPENSSL_EXPORT bcm_status BCM_mldsa65_private_key_from_seed(
+    struct BCM_mldsa65_private_key *out_private_key,
+    const uint8_t seed[BCM_MLDSA_SEED_BYTES]);
+
+OPENSSL_EXPORT bcm_status BCM_mldsa65_public_from_private(
+    struct BCM_mldsa65_public_key *out_public_key,
+    const struct BCM_mldsa65_private_key *private_key);
+
+OPENSSL_EXPORT bcm_status BCM_mldsa65_sign(
+    uint8_t out_encoded_signature[BCM_MLDSA65_SIGNATURE_BYTES],
+    const struct BCM_mldsa65_private_key *private_key, const uint8_t *msg,
+    size_t msg_len, const uint8_t *context, size_t context_len);
+
+OPENSSL_EXPORT bcm_status BCM_mldsa65_verify(
+    const struct BCM_mldsa65_public_key *public_key,
+    const uint8_t signature[BCM_MLDSA65_SIGNATURE_BYTES], const uint8_t *msg,
+    size_t msg_len, const uint8_t *context, size_t context_len);
+
+OPENSSL_EXPORT bcm_status BCM_mldsa65_marshal_public_key(
+    CBB *out, const struct BCM_mldsa65_public_key *public_key);
+
+OPENSSL_EXPORT bcm_status BCM_mldsa65_parse_public_key(
+    struct BCM_mldsa65_public_key *public_key, CBS *in);
+
+OPENSSL_EXPORT bcm_status BCM_mldsa65_parse_private_key(
+    struct BCM_mldsa65_private_key *private_key, CBS *in);
+
+// BCM_mldsa65_generate_key_external_entropy generates a public/private key pair
+// using |entropy| as the seed, writes the encoded public key to
+// |out_encoded_public_key| and sets |out_private_key| to the private key.
+OPENSSL_EXPORT bcm_status BCM_mldsa65_generate_key_external_entropy(
+    uint8_t out_encoded_public_key[BCM_MLDSA65_PUBLIC_KEY_BYTES],
+    struct BCM_mldsa65_private_key *out_private_key,
+    const uint8_t entropy[BCM_MLDSA_SEED_BYTES]);
+
+// BCM_mldsa65_sign_internal signs |msg| using |private_key| and writes the
+// signature to |out_encoded_signature|. The |context_prefix| and |context| are
+// prefixed to the message, in that order, before signing. The |randomizer|
+// value can be set to zero bytes in order to make a deterministic signature, or
+// else filled with entropy for the usual |MLDSA_sign| behavior.
+OPENSSL_EXPORT bcm_status BCM_mldsa65_sign_internal(
+    uint8_t out_encoded_signature[BCM_MLDSA65_SIGNATURE_BYTES],
+    const struct BCM_mldsa65_private_key *private_key, const uint8_t *msg,
+    size_t msg_len, const uint8_t *context_prefix, size_t context_prefix_len,
+    const uint8_t *context, size_t context_len,
+    const uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES]);
+
+// BCM_mldsa65_verify_internal verifies that |encoded_signature| is a valid
+// signature of |msg| by |public_key|. The |context_prefix| and |context| are
+// prefixed to the message before verification, in that order.
+OPENSSL_EXPORT bcm_status BCM_mldsa65_verify_internal(
+    const struct BCM_mldsa65_public_key *public_key,
+    const uint8_t encoded_signature[BCM_MLDSA65_SIGNATURE_BYTES],
+    const uint8_t *msg, size_t msg_len, const uint8_t *context_prefix,
+    size_t context_prefix_len, const uint8_t *context, size_t context_len);
+
+// BCM_mldsa65_marshal_private_key serializes |private_key| to |out| in the
+// NIST format for ML-DSA-65 private keys.
+OPENSSL_EXPORT bcm_status BCM_mldsa65_marshal_private_key(
+    CBB *out, const struct BCM_mldsa65_private_key *private_key);
+
+
 #if defined(__cplusplus)
 }  // extern C
 #endif
diff --git a/crypto/keccak/internal.h b/crypto/fipsmodule/keccak/internal.h
similarity index 100%
rename from crypto/keccak/internal.h
rename to crypto/fipsmodule/keccak/internal.h
diff --git a/crypto/keccak/keccak.cc b/crypto/fipsmodule/keccak/keccak.cc.inc
similarity index 98%
rename from crypto/keccak/keccak.cc
rename to crypto/fipsmodule/keccak/keccak.cc.inc
index b1e640a..2d8567f 100644
--- a/crypto/keccak/keccak.cc
+++ b/crypto/fipsmodule/keccak/keccak.cc.inc
@@ -17,7 +17,7 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include "../internal.h"
+#include "../../internal.h"
 #include "./internal.h"
 
 
diff --git a/crypto/keccak/keccak_test.cc b/crypto/fipsmodule/keccak/keccak_test.cc
similarity index 98%
rename from crypto/keccak/keccak_test.cc
rename to crypto/fipsmodule/keccak/keccak_test.cc
index c90eec4..d8fdcb1 100644
--- a/crypto/keccak/keccak_test.cc
+++ b/crypto/fipsmodule/keccak/keccak_test.cc
@@ -20,8 +20,8 @@
 
 #include <openssl/bytestring.h>
 
-#include "../test/file_test.h"
-#include "../test/test_util.h"
+#include "../../test/file_test.h"
+#include "../../test/test_util.h"
 #include "./internal.h"
 
 
@@ -90,7 +90,7 @@
 }
 
 TEST(KeccakTest, KeccakTestVectors) {
-  FileTestGTest("crypto/keccak/keccak_tests.txt", KeccakFileTest);
+  FileTestGTest("crypto/fipsmodule/keccak/keccak_tests.txt", KeccakFileTest);
 }
 
 TEST(KeccakTest, MultiPass) {
diff --git a/crypto/keccak/keccak_tests.txt b/crypto/fipsmodule/keccak/keccak_tests.txt
similarity index 100%
rename from crypto/keccak/keccak_tests.txt
rename to crypto/fipsmodule/keccak/keccak_tests.txt
diff --git a/crypto/fipsmodule/mldsa/mldsa.cc.inc b/crypto/fipsmodule/mldsa/mldsa.cc.inc
new file mode 100644
index 0000000..7ce4d38
--- /dev/null
+++ b/crypto/fipsmodule/mldsa/mldsa.cc.inc
@@ -0,0 +1,1780 @@
+/* Copyright 2014 The BoringSSL Authors
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/base.h>
+
+#include <memory>
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include <openssl/bytestring.h>
+#include <openssl/mem.h>
+#include <openssl/rand.h>
+
+#include "../../internal.h"
+#include "../bcm_interface.h"
+#include "../keccak/internal.h"
+
+namespace mldsa {
+namespace {
+
+constexpr int kDegree = 256;
+constexpr int kRhoBytes = 32;
+constexpr int kSigmaBytes = 64;
+constexpr int kKBytes = 32;
+constexpr int kTrBytes = 64;
+constexpr int kMuBytes = 64;
+constexpr int kRhoPrimeBytes = 64;
+
+// 2^23 - 2^13 + 1
+constexpr uint32_t kPrime = 8380417;
+// Inverse of -kPrime modulo 2^32
+constexpr uint32_t kPrimeNegInverse = 4236238847;
+constexpr int kDroppedBits = 13;
+constexpr uint32_t kHalfPrime = (kPrime - 1) / 2;
+constexpr uint32_t kGamma2 = (kPrime - 1) / 32;
+// 256^-1 mod kPrime, in Montgomery form.
+constexpr uint32_t kInverseDegreeMontgomery = 41978;
+
+// Constants that vary depending on ML-DSA size.
+//
+// These are implemented as templates which take the K parameter to distinguish
+// the ML-DSA sizes. (At the time of writing, `if constexpr` was not available.)
+//
+// TODO(crbug.com/42290600): Switch this to `if constexpr` when C++17 is
+// available.
+
+template <int K>
+constexpr size_t public_key_bytes();
+
+template <>
+constexpr size_t public_key_bytes<6>() {
+  return BCM_MLDSA65_PUBLIC_KEY_BYTES;
+}
+
+template <int K>
+constexpr size_t signature_bytes();
+
+template <>
+constexpr size_t signature_bytes<6>() {
+  return BCM_MLDSA65_SIGNATURE_BYTES;
+}
+
+template <int K>
+constexpr int tau();
+
+template <>
+constexpr int tau<6>() {
+  return 49;
+}
+
+template <int K>
+constexpr int lambda_bytes();
+
+template <>
+constexpr int lambda_bytes<6>() {
+  return 192 / 8;
+}
+
+template <int K>
+constexpr int gamma1();
+
+template <>
+constexpr int gamma1<6>() {
+  return 1 << 19;
+}
+
+template <int K>
+constexpr int beta();
+
+template <>
+constexpr int beta<6>() {
+  return 196;
+}
+
+template <int K>
+constexpr int omega();
+
+template <>
+constexpr int omega<6>() {
+  return 55;
+}
+
+template <int K>
+constexpr int eta();
+
+template <>
+constexpr int eta<6>() {
+  return 4;
+}
+
+template <int K>
+constexpr int plus_minus_eta_bitlen();
+
+template <>
+constexpr int plus_minus_eta_bitlen<6>() {
+  return 4;
+}
+
+// Fundamental types.
+
+typedef struct scalar {
+  uint32_t c[kDegree];
+} scalar;
+
+template <int K>
+struct vector {
+  scalar v[K];
+};
+
+template <int K, int L>
+struct matrix {
+  scalar v[K][L];
+};
+
+/* Arithmetic */
+
+// This bit of Python will be referenced in some of the following comments:
+//
+// q = 8380417
+// # Inverse of -q modulo 2^32
+// q_neg_inverse = 4236238847
+// # 2^64 modulo q
+// montgomery_square = 2365951
+//
+// def bitreverse(i):
+//     ret = 0
+//     for n in range(8):
+//         bit = i & 1
+//         ret <<= 1
+//         ret |= bit
+//         i >>= 1
+//     return ret
+//
+// def montgomery_reduce(x):
+//     a = (x * q_neg_inverse) % 2**32
+//     b = x + a * q
+//     assert b & 0xFFFF_FFFF == 0
+//     c = b >> 32
+//     assert c < q
+//     return c
+//
+// def montgomery_transform(x):
+//     return montgomery_reduce(x * montgomery_square)
+
+// kNTTRootsMontgomery = [
+//   montgomery_transform(pow(1753, bitreverse(i), q)) for i in range(256)
+// ]
+static const uint32_t kNTTRootsMontgomery[256] = {
+    4193792, 25847,   5771523, 7861508, 237124,  7602457, 7504169, 466468,
+    1826347, 2353451, 8021166, 6288512, 3119733, 5495562, 3111497, 2680103,
+    2725464, 1024112, 7300517, 3585928, 7830929, 7260833, 2619752, 6271868,
+    6262231, 4520680, 6980856, 5102745, 1757237, 8360995, 4010497, 280005,
+    2706023, 95776,   3077325, 3530437, 6718724, 4788269, 5842901, 3915439,
+    4519302, 5336701, 3574422, 5512770, 3539968, 8079950, 2348700, 7841118,
+    6681150, 6736599, 3505694, 4558682, 3507263, 6239768, 6779997, 3699596,
+    811944,  531354,  954230,  3881043, 3900724, 5823537, 2071892, 5582638,
+    4450022, 6851714, 4702672, 5339162, 6927966, 3475950, 2176455, 6795196,
+    7122806, 1939314, 4296819, 7380215, 5190273, 5223087, 4747489, 126922,
+    3412210, 7396998, 2147896, 2715295, 5412772, 4686924, 7969390, 5903370,
+    7709315, 7151892, 8357436, 7072248, 7998430, 1349076, 1852771, 6949987,
+    5037034, 264944,  508951,  3097992, 44288,   7280319, 904516,  3958618,
+    4656075, 8371839, 1653064, 5130689, 2389356, 8169440, 759969,  7063561,
+    189548,  4827145, 3159746, 6529015, 5971092, 8202977, 1315589, 1341330,
+    1285669, 6795489, 7567685, 6940675, 5361315, 4499357, 4751448, 3839961,
+    2091667, 3407706, 2316500, 3817976, 5037939, 2244091, 5933984, 4817955,
+    266997,  2434439, 7144689, 3513181, 4860065, 4621053, 7183191, 5187039,
+    900702,  1859098, 909542,  819034,  495491,  6767243, 8337157, 7857917,
+    7725090, 5257975, 2031748, 3207046, 4823422, 7855319, 7611795, 4784579,
+    342297,  286988,  5942594, 4108315, 3437287, 5038140, 1735879, 203044,
+    2842341, 2691481, 5790267, 1265009, 4055324, 1247620, 2486353, 1595974,
+    4613401, 1250494, 2635921, 4832145, 5386378, 1869119, 1903435, 7329447,
+    7047359, 1237275, 5062207, 6950192, 7929317, 1312455, 3306115, 6417775,
+    7100756, 1917081, 5834105, 7005614, 1500165, 777191,  2235880, 3406031,
+    7838005, 5548557, 6709241, 6533464, 5796124, 4656147, 594136,  4603424,
+    6366809, 2432395, 2454455, 8215696, 1957272, 3369112, 185531,  7173032,
+    5196991, 162844,  1616392, 3014001, 810149,  1652634, 4686184, 6581310,
+    5341501, 3523897, 3866901, 269760,  2213111, 7404533, 1717735, 472078,
+    7953734, 1723600, 6577327, 1910376, 6712985, 7276084, 8119771, 4546524,
+    5441381, 6144432, 7959518, 6094090, 183443,  7403526, 1612842, 4834730,
+    7826001, 3919660, 8332111, 7018208, 3937738, 1400424, 7534263, 1976782};
+
+// Reduces x mod kPrime in constant time, where 0 <= x < 2*kPrime.
+uint32_t reduce_once(uint32_t x) {
+  declassify_assert(x < 2 * kPrime);
+  // return x < kPrime ? x : x - kPrime;
+  return constant_time_select_int(constant_time_lt_w(x, kPrime), x, x - kPrime);
+}
+
+// Returns the absolute value in constant time.
+uint32_t abs_signed(uint32_t x) {
+  // return is_positive(x) ? x : -x;
+  // Note: MSVC doesn't like applying the unary minus operator to unsigned types
+  // (warning C4146), so we write the negation as a subtraction from zero
+  // (unsigned arithmetic wraps, so this is well-defined).
+  return constant_time_select_int(constant_time_lt_w(x, 0x80000000), x, 0u - x);
+}
+
+// Returns the absolute value modulo kPrime.
+uint32_t abs_mod_prime(uint32_t x) {
+  declassify_assert(x < kPrime);
+  // return x > kHalfPrime ? kPrime - x : x;
+  return constant_time_select_int(constant_time_lt_w(kHalfPrime, x), kPrime - x,
+                                  x);
+}
+
+// Returns the maximum of two values in constant time.
+uint32_t maximum(uint32_t x, uint32_t y) {
+  // return x < y ? y : x;
+  return constant_time_select_int(constant_time_lt_w(x, y), y, x);
+}
+
+uint32_t mod_sub(uint32_t a, uint32_t b) {
+  declassify_assert(a < kPrime);
+  declassify_assert(b < kPrime);
+  return reduce_once(kPrime + a - b);
+}
+
+void scalar_add(scalar *out, const scalar *lhs, const scalar *rhs) {
+  for (int i = 0; i < kDegree; i++) {
+    out->c[i] = reduce_once(lhs->c[i] + rhs->c[i]);
+  }
+}
+
+void scalar_sub(scalar *out, const scalar *lhs, const scalar *rhs) {
+  for (int i = 0; i < kDegree; i++) {
+    out->c[i] = mod_sub(lhs->c[i], rhs->c[i]);
+  }
+}
+
+uint32_t reduce_montgomery(uint64_t x) {
+  declassify_assert(x <= ((uint64_t)kPrime << 32));
+  uint64_t a = (uint32_t)x * kPrimeNegInverse;
+  uint64_t b = x + a * kPrime;
+  declassify_assert((b & 0xffffffff) == 0);
+  uint32_t c = b >> 32;
+  return reduce_once(c);
+}
+
+// Multiply two scalars in the number theoretically transformed state.
+void scalar_mult(scalar *out, const scalar *lhs, const scalar *rhs) {
+  for (int i = 0; i < kDegree; i++) {
+    out->c[i] = reduce_montgomery((uint64_t)lhs->c[i] * (uint64_t)rhs->c[i]);
+  }
+}
+
+// In place number theoretic transform of a given scalar.
+//
+// FIPS 204, Algorithm 41 (`NTT`).
+static void scalar_ntt(scalar *s) {
+  // Step: 1, 2, 4, 8, ..., 128
+  // Offset: 128, 64, 32, 16, ..., 1
+  int offset = kDegree;
+  for (int step = 1; step < kDegree; step <<= 1) {
+    offset >>= 1;
+    int k = 0;
+    for (int i = 0; i < step; i++) {
+      assert(k == 2 * offset * i);
+      const uint32_t step_root = kNTTRootsMontgomery[step + i];
+      for (int j = k; j < k + offset; j++) {
+        uint32_t even = s->c[j];
+        // |reduce_montgomery| works on values up to kPrime*R and R > 2*kPrime.
+        // |step_root| < kPrime because it's static data. |s->c[...]| is <
+        // kPrime by the invariants of that struct.
+        uint32_t odd =
+            reduce_montgomery((uint64_t)step_root * (uint64_t)s->c[j + offset]);
+        s->c[j] = reduce_once(odd + even);
+        s->c[j + offset] = mod_sub(even, odd);
+      }
+      k += 2 * offset;
+    }
+  }
+}
+
+// In place inverse number theoretic transform of a given scalar.
+//
+// FIPS 204, Algorithm 42 (`NTT^-1`).
+void scalar_inverse_ntt(scalar *s) {
+  // Step: 128, 64, 32, 16, ..., 1
+  // Offset: 1, 2, 4, 8, ..., 128
+  int step = kDegree;
+  for (int offset = 1; offset < kDegree; offset <<= 1) {
+    step >>= 1;
+    int k = 0;
+    for (int i = 0; i < step; i++) {
+      assert(k == 2 * offset * i);
+      const uint32_t step_root =
+          kPrime - kNTTRootsMontgomery[step + (step - 1 - i)];
+      for (int j = k; j < k + offset; j++) {
+        uint32_t even = s->c[j];
+        uint32_t odd = s->c[j + offset];
+        s->c[j] = reduce_once(odd + even);
+
+        // |reduce_montgomery| works on values up to kPrime*R and R > 2*kPrime.
+        // kPrime + even < 2*kPrime because |even| < kPrime, by the invariants
+        // of that structure. Thus kPrime + even - odd < 2*kPrime because odd >=
+        // 0, because it's unsigned and less than kPrime. Lastly step_root <
+        // kPrime, because |kNTTRootsMontgomery| is static data.
+        s->c[j + offset] = reduce_montgomery((uint64_t)step_root *
+                                             (uint64_t)(kPrime + even - odd));
+      }
+      k += 2 * offset;
+    }
+  }
+  for (int i = 0; i < kDegree; i++) {
+    s->c[i] = reduce_montgomery((uint64_t)s->c[i] *
+                                (uint64_t)kInverseDegreeMontgomery);
+  }
+}
+
+template <int X>
+void vector_zero(vector<X> *out) {
+  OPENSSL_memset(out, 0, sizeof(*out));
+}
+
+template <int X>
+void vector_add(vector<X> *out, const vector<X> *lhs, const vector<X> *rhs) {
+  for (int i = 0; i < X; i++) {
+    scalar_add(&out->v[i], &lhs->v[i], &rhs->v[i]);
+  }
+}
+
+template <int X>
+void vector_sub(vector<X> *out, const vector<X> *lhs, const vector<X> *rhs) {
+  for (int i = 0; i < X; i++) {
+    scalar_sub(&out->v[i], &lhs->v[i], &rhs->v[i]);
+  }
+}
+
+template <int X>
+void vector_mult_scalar(vector<X> *out, const vector<X> *lhs,
+                        const scalar *rhs) {
+  for (int i = 0; i < X; i++) {
+    scalar_mult(&out->v[i], &lhs->v[i], rhs);
+  }
+}
+
+template <int X>
+void vector_ntt(vector<X> *a) {
+  for (int i = 0; i < X; i++) {
+    scalar_ntt(&a->v[i]);
+  }
+}
+
+template <int X>
+void vector_inverse_ntt(vector<X> *a) {
+  for (int i = 0; i < X; i++) {
+    scalar_inverse_ntt(&a->v[i]);
+  }
+}
+
+template <int K, int L>
+void matrix_mult(vector<K> *out, const matrix<K, L> *m, const vector<L> *a) {
+  vector_zero(out);
+  for (int i = 0; i < K; i++) {
+    for (int j = 0; j < L; j++) {
+      scalar product;
+      scalar_mult(&product, &m->v[i][j], &a->v[j]);
+      scalar_add(&out->v[i], &out->v[i], &product);
+    }
+  }
+}
+
+/* Rounding & hints */
+
+// FIPS 204, Algorithm 35 (`Power2Round`).
+void power2_round(uint32_t *r1, uint32_t *r0, uint32_t r) {
+  *r1 = r >> kDroppedBits;
+  *r0 = r - (*r1 << kDroppedBits);
+
+  uint32_t r0_adjusted = mod_sub(*r0, 1 << kDroppedBits);
+  uint32_t r1_adjusted = *r1 + 1;
+
+  // Mask is set iff r0 > 2^(dropped_bits - 1).
+  crypto_word_t mask =
+      constant_time_lt_w((uint32_t)(1 << (kDroppedBits - 1)), *r0);
+  // r0 = mask ? r0_adjusted : r0
+  *r0 = constant_time_select_int(mask, r0_adjusted, *r0);
+  // r1 = mask ? r1_adjusted : r1
+  *r1 = constant_time_select_int(mask, r1_adjusted, *r1);
+}
+
+// Scale back previously rounded value.
+void scale_power2_round(uint32_t *out, uint32_t r1) {
+  // Pre-condition: 0 <= r1 <= 2^10 - 1
+  assert(r1 < (1u << 10));
+
+  *out = r1 << kDroppedBits;
+
+  // Post-condition: 0 <= out <= 2^23 - 2^13 = kPrime - 1
+  assert(*out < kPrime);
+}
+
+// FIPS 204, Algorithm 37 (`HighBits`).
+uint32_t high_bits(uint32_t x) {
+  // Reference description (given 0 <= x < q):
+  //
+  // ```
+  // int32_t r0 = x mod+- (2 * kGamma2);
+  // if (x - r0 == q - 1) {
+  //   return 0;
+  // } else {
+  //   return (x - r0) / (2 * kGamma2);
+  // }
+  // ```
+  //
+  // Below is the formula taken from the reference implementation.
+  //
+  // Here, kGamma2 == 2^18 - 2^8
+  // This returns ((ceil(x / 2^7) * (2^10 + 1) + 2^21) / 2^22) mod 2^4
+  uint32_t r1 = (x + 127) >> 7;
+  r1 = (r1 * 1025 + (1 << 21)) >> 22;
+  r1 &= 15;
+  return r1;
+}
+
+// FIPS 204, Algorithm 36 (`Decompose`).
+void decompose(uint32_t *r1, int32_t *r0, uint32_t r) {
+  *r1 = high_bits(r);
+
+  *r0 = r;
+  *r0 -= *r1 * 2 * (int32_t)kGamma2;
+  *r0 -= (((int32_t)kHalfPrime - *r0) >> 31) & (int32_t)kPrime;
+}
+
+// FIPS 204, Algorithm 38 (`LowBits`).
+int32_t low_bits(uint32_t x) {
+  uint32_t r1;
+  int32_t r0;
+  decompose(&r1, &r0, x);
+  return r0;
+}
+
+// FIPS 204, Algorithm 39 (`MakeHint`).
+//
+// In the spec this takes two arguments, z and r, and is called with
+//   z = -ct0
+//   r = w - cs2 + ct0
+//
+// It then computes HighBits (algorithm 37) of r and z+r. But z+r is just w -
+// cs2, so this takes three arguments and saves an addition.
+int32_t make_hint(uint32_t ct0, uint32_t cs2, uint32_t w) {
+  uint32_t r_plus_z = mod_sub(w, cs2);
+  uint32_t r = reduce_once(r_plus_z + ct0);
+  return high_bits(r) != high_bits(r_plus_z);
+}
+
+// FIPS 204, Algorithm 40 (`UseHint`).
+uint32_t use_hint_vartime(uint32_t h, uint32_t r) {
+  uint32_t r1;
+  int32_t r0;
+  decompose(&r1, &r0, r);
+
+  if (h) {
+    if (r0 > 0) {
+      // m = 16, thus |mod m| in the spec turns into |& 15|.
+      return (r1 + 1) & 15;
+    } else {
+      return (r1 - 1) & 15;
+    }
+  }
+  return r1;
+}
+
+void scalar_power2_round(scalar *s1, scalar *s0, const scalar *s) {
+  for (int i = 0; i < kDegree; i++) {
+    power2_round(&s1->c[i], &s0->c[i], s->c[i]);
+  }
+}
+
+void scalar_scale_power2_round(scalar *out, const scalar *in) {
+  for (int i = 0; i < kDegree; i++) {
+    scale_power2_round(&out->c[i], in->c[i]);
+  }
+}
+
+void scalar_high_bits(scalar *out, const scalar *in) {
+  for (int i = 0; i < kDegree; i++) {
+    out->c[i] = high_bits(in->c[i]);
+  }
+}
+
+void scalar_low_bits(scalar *out, const scalar *in) {
+  for (int i = 0; i < kDegree; i++) {
+    out->c[i] = low_bits(in->c[i]);
+  }
+}
+
+void scalar_max(uint32_t *max, const scalar *s) {
+  for (int i = 0; i < kDegree; i++) {
+    uint32_t abs = abs_mod_prime(s->c[i]);
+    *max = maximum(*max, abs);
+  }
+}
+
+void scalar_max_signed(uint32_t *max, const scalar *s) {
+  for (int i = 0; i < kDegree; i++) {
+    uint32_t abs = abs_signed(s->c[i]);
+    *max = maximum(*max, abs);
+  }
+}
+
+void scalar_make_hint(scalar *out, const scalar *ct0, const scalar *cs2,
+                      const scalar *w) {
+  for (int i = 0; i < kDegree; i++) {
+    out->c[i] = make_hint(ct0->c[i], cs2->c[i], w->c[i]);
+  }
+}
+
+void scalar_use_hint_vartime(scalar *out, const scalar *h, const scalar *r) {
+  for (int i = 0; i < kDegree; i++) {
+    out->c[i] = use_hint_vartime(h->c[i], r->c[i]);
+  }
+}
+
+template <int X>
+void vector_power2_round(vector<X> *t1, vector<X> *t0, const vector<X> *t) {
+  for (int i = 0; i < X; i++) {
+    scalar_power2_round(&t1->v[i], &t0->v[i], &t->v[i]);
+  }
+}
+
+template <int X>
+void vector_scale_power2_round(vector<X> *out, const vector<X> *in) {
+  for (int i = 0; i < X; i++) {
+    scalar_scale_power2_round(&out->v[i], &in->v[i]);
+  }
+}
+
+template <int X>
+void vector_high_bits(vector<X> *out, const vector<X> *in) {
+  for (int i = 0; i < X; i++) {
+    scalar_high_bits(&out->v[i], &in->v[i]);
+  }
+}
+
+template <int X>
+void vector_low_bits(vector<X> *out, const vector<X> *in) {
+  for (int i = 0; i < X; i++) {
+    scalar_low_bits(&out->v[i], &in->v[i]);
+  }
+}
+
+template <int X>
+uint32_t vector_max(const vector<X> *a) {
+  uint32_t max = 0;
+  for (int i = 0; i < X; i++) {
+    scalar_max(&max, &a->v[i]);
+  }
+  return max;
+}
+
+template <int X>
+uint32_t vector_max_signed(const vector<X> *a) {
+  uint32_t max = 0;
+  for (int i = 0; i < X; i++) {
+    scalar_max_signed(&max, &a->v[i]);
+  }
+  return max;
+}
+
+// The input vector contains only zeroes and ones.
+template <int X>
+size_t vector_count_ones(const vector<X> *a) {
+  size_t count = 0;
+  for (int i = 0; i < X; i++) {
+    for (int j = 0; j < kDegree; j++) {
+      count += a->v[i].c[j];
+    }
+  }
+  return count;
+}
+
+template <int X>
+void vector_make_hint(vector<X> *out, const vector<X> *ct0,
+                      const vector<X> *cs2, const vector<X> *w) {
+  for (int i = 0; i < X; i++) {
+    scalar_make_hint(&out->v[i], &ct0->v[i], &cs2->v[i], &w->v[i]);
+  }
+}
+
+template <int X>
+void vector_use_hint_vartime(vector<X> *out, const vector<X> *h,
+                             const vector<X> *r) {
+  for (int i = 0; i < X; i++) {
+    scalar_use_hint_vartime(&out->v[i], &h->v[i], &r->v[i]);
+  }
+}
+
+/* Bit packing */
+
+// FIPS 204, Algorithm 16 (`SimpleBitPack`). Specialized to bitlen(b) = 4.
+static void scalar_encode_4(uint8_t out[128], const scalar *s) {
+  // Every two elements lands on a byte boundary.
+  static_assert(kDegree % 2 == 0, "kDegree must be a multiple of 2");
+  for (int i = 0; i < kDegree / 2; i++) {
+    uint32_t a = s->c[2 * i];
+    uint32_t b = s->c[2 * i + 1];
+    declassify_assert(a < 16);
+    declassify_assert(b < 16);
+    out[i] = a | (b << 4);
+  }
+}
+
+// FIPS 204, Algorithm 16 (`SimpleBitPack`). Specialized to bitlen(b) = 10.
+void scalar_encode_10(uint8_t out[320], const scalar *s) {
+  // Every four elements lands on a byte boundary.
+  static_assert(kDegree % 4 == 0, "kDegree must be a multiple of 4");
+  for (int i = 0; i < kDegree / 4; i++) {
+    uint32_t a = s->c[4 * i];
+    uint32_t b = s->c[4 * i + 1];
+    uint32_t c = s->c[4 * i + 2];
+    uint32_t d = s->c[4 * i + 3];
+    declassify_assert(a < 1024);
+    declassify_assert(b < 1024);
+    declassify_assert(c < 1024);
+    declassify_assert(d < 1024);
+    out[5 * i] = (uint8_t)a;
+    out[5 * i + 1] = (uint8_t)((a >> 8) | (b << 2));
+    out[5 * i + 2] = (uint8_t)((b >> 6) | (c << 4));
+    out[5 * i + 3] = (uint8_t)((c >> 4) | (d << 6));
+    out[5 * i + 4] = (uint8_t)(d >> 2);
+  }
+}
+
+// FIPS 204, Algorithm 17 (`BitPack`). Specialized to bitlen(a+b) = 4 and b = 4.
+void scalar_encode_signed_4_4(uint8_t out[128], const scalar *s) {
+  // Every two elements lands on a byte boundary.
+  static_assert(kDegree % 2 == 0, "kDegree must be a multiple of 2");
+  for (int i = 0; i < kDegree / 2; i++) {
+    uint32_t a = mod_sub(4, s->c[2 * i]);
+    uint32_t b = mod_sub(4, s->c[2 * i + 1]);
+    declassify_assert(a < 16);
+    declassify_assert(b < 16);
+    out[i] = a | (b << 4);
+  }
+}
+
+// FIPS 204, Algorithm 17 (`BitPack`). Specialized to bitlen(b) = 13 and b =
+// 2^12.
+void scalar_encode_signed_13_12(uint8_t out[416], const scalar *s) {
+  static const uint32_t kMax = 1u << 12;
+  // Every eight elements lands on a byte boundary.
+  static_assert(kDegree % 8 == 0, "kDegree must be a multiple of 8");
+  for (int i = 0; i < kDegree / 8; i++) {
+    uint32_t a = mod_sub(kMax, s->c[8 * i]);
+    uint32_t b = mod_sub(kMax, s->c[8 * i + 1]);
+    uint32_t c = mod_sub(kMax, s->c[8 * i + 2]);
+    uint32_t d = mod_sub(kMax, s->c[8 * i + 3]);
+    uint32_t e = mod_sub(kMax, s->c[8 * i + 4]);
+    uint32_t f = mod_sub(kMax, s->c[8 * i + 5]);
+    uint32_t g = mod_sub(kMax, s->c[8 * i + 6]);
+    uint32_t h = mod_sub(kMax, s->c[8 * i + 7]);
+    declassify_assert(a < (1u << 13));
+    declassify_assert(b < (1u << 13));
+    declassify_assert(c < (1u << 13));
+    declassify_assert(d < (1u << 13));
+    declassify_assert(e < (1u << 13));
+    declassify_assert(f < (1u << 13));
+    declassify_assert(g < (1u << 13));
+    declassify_assert(h < (1u << 13));
+    a |= b << 13;
+    a |= c << 26;
+    c >>= 6;
+    c |= d << 7;
+    c |= e << 20;
+    e >>= 12;
+    e |= f << 1;
+    e |= g << 14;
+    e |= h << 27;
+    h >>= 5;
+    OPENSSL_memcpy(&out[13 * i], &a, sizeof(a));
+    OPENSSL_memcpy(&out[13 * i + 4], &c, sizeof(c));
+    OPENSSL_memcpy(&out[13 * i + 8], &e, sizeof(e));
+    OPENSSL_memcpy(&out[13 * i + 12], &h, 1);
+  }
+}
+
+// FIPS 204, Algorithm 17 (`BitPack`). Specialized to bitlen(a+b) = 20 and b =
+// 2^19. Each group of four coefficients is packed into 10 output bytes.
+void scalar_encode_signed_20_19(uint8_t out[640], const scalar *s) {
+  static const uint32_t kMax = 1u << 19;
+  // Every four elements lands on a byte boundary.
+  static_assert(kDegree % 4 == 0, "kDegree must be a multiple of 4");
+  for (int i = 0; i < kDegree / 4; i++) {
+    uint32_t a = mod_sub(kMax, s->c[4 * i]);
+    uint32_t b = mod_sub(kMax, s->c[4 * i + 1]);
+    uint32_t c = mod_sub(kMax, s->c[4 * i + 2]);
+    uint32_t d = mod_sub(kMax, s->c[4 * i + 3]);
+    declassify_assert(a < (1u << 20));
+    declassify_assert(b < (1u << 20));
+    declassify_assert(c < (1u << 20));
+    declassify_assert(d < (1u << 20));
+    a |= b << 20;  // Low 12 bits of b; the rest continue in the next word.
+    b >>= 12;
+    b |= c << 8;
+    b |= d << 28;  // Low 4 bits of d; the top 16 bits form the last two bytes.
+    d >>= 4;
+    OPENSSL_memcpy(&out[10 * i], &a, sizeof(a));  // Host byte order.
+    OPENSSL_memcpy(&out[10 * i + 4], &b, sizeof(b));
+    OPENSSL_memcpy(&out[10 * i + 8], &d, 2);
+  }
+}
+
+// FIPS 204, Algorithm 17 (`BitPack`). |bits| selects the specialization.
+void scalar_encode_signed(uint8_t *out, const scalar *s, int bits,
+                          uint32_t max) {
+  if (bits == 4) {
+    assert(max == 4);
+    return scalar_encode_signed_4_4(out, s);
+  }
+  if (bits == 20) {
+    assert(max == 1u << 19);
+    return scalar_encode_signed_20_19(out, s);
+  }
+  assert(bits == 13);  // The only other width with an encoder.
+  assert(max == 1u << 12);
+  scalar_encode_signed_13_12(out, s);
+}
+
+// FIPS 204, Algorithm 18 (`SimpleBitUnpack`). Specialized for 10-bit values.
+void scalar_decode_10(scalar *out, const uint8_t in[320]) {
+  static_assert(kDegree % 4 == 0, "kDegree must be a multiple of 4");
+  for (int group = 0; group < kDegree / 4; group++) {
+    uint32_t low;  // First four of the five bytes holding this group.
+    OPENSSL_memcpy(&low, &in[5 * group], sizeof(low));
+    out->c[4 * group] = low & 0x3ff;
+    out->c[4 * group + 1] = (low >> 10) & 0x3ff;
+    out->c[4 * group + 2] = (low >> 20) & 0x3ff;
+    out->c[4 * group + 3] = (low >> 30) | (((uint32_t)in[5 * group + 4]) << 2);
+  }
+}
+
+// FIPS 204, Algorithm 19 (`BitUnpack`). Specialized to bitlen(a+b) = 4 and b =
+// 4. Returns one on success and zero if any nibble of |in| is greater than
+// eight, in which case |out| may have been partially written.
+int scalar_decode_signed_4_4(scalar *out, const uint8_t in[128]) {
+  static_assert(kDegree % 8 == 0, "kDegree must be a multiple of 8");
+  for (int word = 0; word < kDegree / 8; word++) {
+    // Handle eight nibbles (four bytes) per iteration.
+    uint32_t packed;
+    OPENSSL_memcpy(&packed, &in[4 * word], sizeof(packed));
+
+    // A valid nibble is at most eight, so a set top bit requires the three
+    // lower bits to be clear. Gather the top bit of every nibble...
+    const uint32_t top_bits = packed & 0x88888888u;
+    // ...and spread each one over the lower three bits of its own nibble.
+    const uint32_t forbidden =
+        (top_bits >> 1) | (top_bits >> 2) | (top_bits >> 3);
+    // Out-of-range nibbles only occur for invalid input, in which case it is
+    // okay to leak the value.
+    if (constant_time_declassify_int((forbidden & packed) != 0)) {
+      return 0;
+    }
+
+    // Map every nibble through mod_sub(4, nibble), lowest nibble first.
+    for (int k = 0; k < 8; k++) {
+      out->c[word * 8 + k] = mod_sub(4, (packed >> (4 * k)) & 15);
+    }
+  }
+  return 1;
+}
+
+// FIPS 204, Algorithm 19 (`BitUnpack`). Specialized to bitlen(a+b) = 13 and b =
+// 2^12. Every 13 input bytes yield eight coefficients.
+void scalar_decode_signed_13_12(scalar *out, const uint8_t in[416]) {
+  static const uint32_t kMax = 1u << 12;
+  static const uint32_t k13Bits = (1u << 13) - 1;
+  static const uint32_t k7Bits = (1u << 7) - 1;
+
+  uint32_t a, b, c;  // Bytes 0-3, 4-7 and 8-11 of the current group.
+  uint8_t d;         // Byte 12 of the current group.
+  static_assert(kDegree % 8 == 0, "kDegree must be a multiple of 8");
+  for (int i = 0; i < kDegree / 8; i++) {
+    OPENSSL_memcpy(&a, &in[13 * i], sizeof(a));
+    OPENSSL_memcpy(&b, &in[13 * i + 4], sizeof(b));
+    OPENSSL_memcpy(&c, &in[13 * i + 8], sizeof(c));
+    d = in[13 * i + 12];
+
+    // It's not possible for a 13-bit number to be out of range when the max is
+    // 2^12.
+    out->c[i * 8] = mod_sub(kMax, a & k13Bits);
+    out->c[i * 8 + 1] = mod_sub(kMax, (a >> 13) & k13Bits);
+    out->c[i * 8 + 2] = mod_sub(kMax, (a >> 26) | ((b & k7Bits) << 6));
+    out->c[i * 8 + 3] = mod_sub(kMax, (b >> 7) & k13Bits);
+    out->c[i * 8 + 4] = mod_sub(kMax, (b >> 20) | ((c & 1) << 12));
+    out->c[i * 8 + 5] = mod_sub(kMax, (c >> 1) & k13Bits);
+    out->c[i * 8 + 6] = mod_sub(kMax, (c >> 14) & k13Bits);
+    out->c[i * 8 + 7] = mod_sub(kMax, (c >> 27) | ((uint32_t)d) << 5);
+  }
+}
+
+// FIPS 204, Algorithm 19 (`BitUnpack`). Specialized to bitlen(a+b) = 20 and b =
+// 2^19. Every 10 input bytes yield four coefficients.
+void scalar_decode_signed_20_19(scalar *out, const uint8_t in[640]) {
+  static const uint32_t kMax = 1u << 19;
+  static const uint32_t k20Bits = (1u << 20) - 1;
+
+  uint32_t a, b;  // Bytes 0-3 and 4-7 of the current group.
+  uint16_t c;     // Bytes 8-9 of the current group.
+  static_assert(kDegree % 4 == 0, "kDegree must be a multiple of 4");
+  for (int i = 0; i < kDegree / 4; i++) {
+    OPENSSL_memcpy(&a, &in[10 * i], sizeof(a));
+    OPENSSL_memcpy(&b, &in[10 * i + 4], sizeof(b));
+    OPENSSL_memcpy(&c, &in[10 * i + 8], sizeof(c));
+
+    // It's not possible for a 20-bit number to be out of range when the max is
+    // 2^19.
+    out->c[i * 4] = mod_sub(kMax, a & k20Bits);
+    out->c[i * 4 + 1] = mod_sub(kMax, (a >> 20) | ((b & 0xff) << 12));
+    out->c[i * 4 + 2] = mod_sub(kMax, (b >> 8) & k20Bits);
+    out->c[i * 4 + 3] = mod_sub(kMax, (b >> 28) | ((uint32_t)c) << 4);
+  }
+}
+
+// FIPS 204, Algorithm 19 (`BitUnpack`). Returns one on success, zero on error.
+int scalar_decode_signed(scalar *out, const uint8_t *in, int bits,
+                         uint32_t max) {
+  if (bits == 4) {
+    assert(max == 4);
+    return scalar_decode_signed_4_4(out, in);  // The only width that can fail.
+  }
+  if (bits == 13) {
+    assert(max == (1u << 12));
+    scalar_decode_signed_13_12(out, in);
+  } else if (bits == 20) {
+    assert(max == (1u << 19));
+    scalar_decode_signed_20_19(out, in);
+  } else {
+    abort();  // Unsupported width.
+  }
+  return 1;
+}
+
+/* Expansion functions */
+
+// FIPS 204, Algorithm 30 (`RejNTTPoly`).
+//
+// Fills |out| with values below kPrime by rejection sampling a SHAKE-128
+// stream. Used for matrix expansion and only operates on public inputs.
+void scalar_from_keccak_vartime(scalar *out,
+                                const uint8_t derived_seed[kRhoBytes + 2]) {
+  struct BORINGSSL_keccak_st keccak_ctx;
+  BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake128);
+  BORINGSSL_keccak_absorb(&keccak_ctx, derived_seed, kRhoBytes + 2);
+  assert(keccak_ctx.squeeze_offset == 0);
+  assert(keccak_ctx.rate_bytes == 168);
+  static_assert(168 % 3 == 0, "block and coefficient boundaries do not align");
+
+  uint8_t block[168];
+  for (int filled = 0; filled < kDegree;) {
+    BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block));
+    for (size_t pos = 0; pos < sizeof(block) && filled < kDegree; pos += 3) {
+      // FIPS 204, Algorithm 14 (`CoeffFromThreeBytes`).
+      const uint32_t candidate = (uint32_t)block[pos] |
+                                 ((uint32_t)block[pos + 1] << 8) |
+                                 (((uint32_t)block[pos + 2] & 0x7f) << 16);
+      if (candidate < kPrime) {
+        out->c[filled++] = candidate;
+      }
+    }
+  }
+}
+
+template <int ETA>
+static bool coefficient_from_nibble(uint32_t nibble, uint32_t *result);
+
+template <>
+bool coefficient_from_nibble<4>(uint32_t nibble, uint32_t *result) {
+  if (!constant_time_declassify_int(nibble < 9)) {
+    return false;  // Rejected; |*result| is left untouched.
+  }
+  *result = mod_sub(4, nibble);  // Accepted nibbles are 0 through 8.
+  return true;
+}
+
+// FIPS 204, Algorithm 31 (`RejBoundedPoly`). Fills |out| by rejection sampling
+// the nibbles of a SHAKE-256 stream over |derived_seed|.
+template <int ETA>
+void scalar_uniform(scalar *out, const uint8_t derived_seed[kSigmaBytes + 2]) {
+  struct BORINGSSL_keccak_st keccak_ctx;
+  BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256);
+  BORINGSSL_keccak_absorb(&keccak_ctx, derived_seed, kSigmaBytes + 2);
+  assert(keccak_ctx.squeeze_offset == 0);
+  assert(keccak_ctx.rate_bytes == 136);
+
+  uint8_t block[136];
+  for (int filled = 0; filled < kDegree;) {
+    BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block));
+    for (size_t pos = 0; pos < sizeof(block) && filled < kDegree; ++pos) {
+      // Each byte offers its low nibble first and then its high nibble.
+      const uint32_t nibbles[2] = {static_cast<uint32_t>(block[pos] & 0x0F),
+                                   static_cast<uint32_t>(block[pos] >> 4)};
+      // FIPS 204, Algorithm 15 (`CoefFromHalfByte`). Although both the input
+      // and output here are secret, it is OK to leak which nibbles were
+      // rejected. Individual bytes of the SHAKE-256 stream are
+      // (indistinguishable from) independent of each other and the original
+      // seed, so the rejected ones reveal neither the input nor the output.
+      for (int n = 0; n < 2 && filled < kDegree; n++) {
+        uint32_t coeff;
+        if (coefficient_from_nibble<ETA>(nibbles[n], &coeff)) {
+          out->c[filled++] = coeff;
+        }
+      }
+    }
+  }
+}
+
+// One step of FIPS 204, Algorithm 34 (`ExpandMask`): hashes |derived_seed| with
+// SHAKE-256 and decodes the 640-byte output as 20-bit signed coefficients.
+void scalar_sample_mask(scalar *out,
+                        const uint8_t derived_seed[kRhoPrimeBytes + 2]) {
+  uint8_t packed_mask[640];
+  BORINGSSL_keccak(packed_mask, sizeof(packed_mask), derived_seed,
+                   kRhoPrimeBytes + 2, boringssl_shake256);
+  scalar_decode_signed_20_19(out, packed_mask);
+}
+
+// FIPS 204, Algorithm 29 (`SampleInBall`). Zeroes |out|, then gives |tau| of
+// its coefficients the value mod_sub(1, 0) or mod_sub(1, 2). Variable time.
+void scalar_sample_in_ball_vartime(scalar *out, const uint8_t *seed, int len,
+                                   int tau) {
+  struct BORINGSSL_keccak_st keccak_ctx;
+  BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256);
+  BORINGSSL_keccak_absorb(&keccak_ctx, seed, len);
+  assert(keccak_ctx.squeeze_offset == 0);
+  assert(keccak_ctx.rate_bytes == 136);
+
+  uint8_t block[136];
+  BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block));
+
+  // The first eight bytes of the stream provide the sign bits, one consumed per
+  // iteration below. All later bytes are candidate positions.
+  uint64_t sign_bits = CRYPTO_load_u64_le(block);
+  int next = 8;
+  // SampleInBall implements a Fisher–Yates shuffle, which unavoidably leaks
+  // where the zeros are by memory access pattern. Although this leak happens
+  // before bad signatures are rejected, this is safe. See
+  // https://boringssl-review.googlesource.com/c/boringssl/+/67747/comment/8d8f01ac_70af3f21/
+  CONSTTIME_DECLASSIFY(block + next, sizeof(block) - next);
+
+  OPENSSL_memset(out, 0, sizeof(*out));
+  for (size_t i = kDegree - tau; i < kDegree; i++) {
+    // Draw bytes until one is at most |i|, refilling |block| when it runs out.
+    size_t pos;
+    do {
+      if (next == 136) {
+        BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block));
+        // Declassified for the same reason as the first block.
+        CONSTTIME_DECLASSIFY(block, sizeof(block));
+        next = 0;
+      }
+      pos = block[next++];
+    } while (pos > i);
+
+    out->c[i] = out->c[pos];
+    out->c[pos] = mod_sub(1, 2 * (sign_bits & 1));
+    sign_bits >>= 1;
+  }
+}
+
+// FIPS 204, Algorithm 32 (`ExpandA`). Entry (i, j) uses seed rho || j || i.
+template <int K, int L>
+void matrix_expand(matrix<K, L> *out, const uint8_t rho[kRhoBytes]) {
+  static_assert(K <= 0x100, "K must fit in 8 bits");
+  static_assert(L <= 0x100, "L must fit in 8 bits");
+
+  uint8_t derived_seed[kRhoBytes + 2];
+  OPENSSL_memcpy(derived_seed, rho, kRhoBytes);
+  for (int row = 0; row < K; row++) {
+    derived_seed[kRhoBytes + 1] = (uint8_t)row;
+    for (int col = 0; col < L; col++) {
+      derived_seed[kRhoBytes] = (uint8_t)col;
+      scalar_from_keccak_vartime(&out->v[row][col], derived_seed);
+    }
+  }
+}
+
+// FIPS 204, Algorithm 33 (`ExpandS`). Samples the L entries of |s1|, then the
+// K entries of |s2|, each from |sigma| followed by a counter byte and a zero.
+template <int K, int L>
+void vector_expand_short(vector<L> *s1, vector<K> *s2,
+                         const uint8_t sigma[kSigmaBytes]) {
+  static_assert(K <= 0x100, "K must fit in 8 bits");
+  static_assert(L <= 0x100, "L must fit in 8 bits");
+  static_assert(K + L <= 0x100, "K+L must fit in 8 bits");
+
+  uint8_t derived_seed[kSigmaBytes + 2];
+  OPENSSL_memcpy(derived_seed, sigma, kSigmaBytes);
+  derived_seed[kSigmaBytes + 1] = 0;
+  uint8_t *const counter = &derived_seed[kSigmaBytes];
+  *counter = 0;
+  for (int i = 0; i < L; i++, ++*counter) {
+    scalar_uniform<eta<K>()>(&s1->v[i], derived_seed);
+  }
+  for (int i = 0; i < K; i++, ++*counter) {
+    scalar_uniform<eta<K>()>(&s2->v[i], derived_seed);
+  }
+}
+
+// FIPS 204, Algorithm 34 (`ExpandMask`). Entry i is sampled from |seed|
+// followed by kappa + i as two little-endian bytes.
+template <int L>
+void vector_expand_mask(vector<L> *out, const uint8_t seed[kRhoPrimeBytes],
+                        size_t kappa) {
+  assert(kappa + L <= 0x10000);
+  uint8_t derived_seed[kRhoPrimeBytes + 2];
+  OPENSSL_memcpy(derived_seed, seed, kRhoPrimeBytes);
+  size_t counter = kappa;
+  for (int i = 0; i < L; i++, counter++) {
+    derived_seed[kRhoPrimeBytes] = counter & 0xFF;
+    derived_seed[kRhoPrimeBytes + 1] = (counter >> 8) & 0xFF;
+    scalar_sample_mask(&out->v[i], derived_seed);
+  }
+}
+
+/* Encoding */
+
+// FIPS 204, Algorithm 16 (`SimpleBitPack`). Writes the K entries of |a| back to
+// back, 32*|bits| bytes each. kDegree (256) is a multiple of eight, so every
+// entry starts on a byte boundary. |bits| must be 4 or 10.
+template <int K>
+void vector_encode(uint8_t *out, const vector<K> *a, int bits) {
+  const bool is_4_bit = bits == 4;
+  if (!is_4_bit) {
+    assert(bits == 10);
+  }
+  for (int i = 0; i < K; i++) {
+    uint8_t *const entry_out = out + i * bits * kDegree / 8;
+    if (is_4_bit) {
+      scalar_encode_4(entry_out, &a->v[i]);
+    } else {
+      scalar_encode_10(entry_out, &a->v[i]);
+    }
+  }
+}
+
+// FIPS 204, Algorithm 18 (`SimpleBitUnpack`), applied to each of K entries.
+template <int K>
+void vector_decode_10(vector<K> *out, const uint8_t *in) {
+  for (int entry = 0; entry < K; entry++) {
+    scalar_decode_10(&out->v[entry], &in[entry * 10 * kDegree / 8]);
+  }
+}
+
+// FIPS 204, Algorithm 17 (`BitPack`).
+//
+// Writes the X entries of |a| back to back, 32*|bits| bytes each. kDegree (256)
+// is a multiple of eight, so every entry starts on a byte boundary.
+template <int X>
+void vector_encode_signed(uint8_t *out, const vector<X> *a, int bits,
+                          uint32_t max) {
+  for (int entry = 0; entry < X; entry++) {
+    uint8_t *const entry_out = &out[entry * bits * kDegree / 8];
+    scalar_encode_signed(entry_out, &a->v[entry], bits, max);
+  }
+}
+
+// FIPS 204, Algorithm 19 (`BitUnpack`). Stops at the first entry that fails.
+template <int X>
+int vector_decode_signed(vector<X> *out, const uint8_t *in, int bits,
+                         uint32_t max) {
+  int ok = 1;
+  for (int entry = 0; ok && entry < X; entry++) {
+    const uint8_t *entry_in = &in[entry * bits * kDegree / 8];
+    ok = scalar_decode_signed(&out->v[entry], entry_in, bits, max) != 0;
+  }
+  return ok;
+}
+
+// FIPS 204, Algorithm 28 (`w1Encode`): packs |w1| at four bits per coefficient.
+template <int K>
+void w1_encode(uint8_t out[128 * K], const vector<K> *w1) {
+  vector_encode<K>(out, w1, /*bits=*/4);
+}
+
+// FIPS 204, Algorithm 20 (`HintBitPack`).
+template <int K>
+void hint_bit_pack(uint8_t out[omega<K>() + K], const vector<K> *h) {
+  OPENSSL_memset(out, 0, omega<K>() + K);
+  int used = 0;
+  for (int poly = 0; poly < K; poly++) {
+    for (int coeff = 0; coeff < kDegree; coeff++) {
+      if (!h->v[poly].c[coeff]) {
+        continue;
+      }
+      BSSL_CHECK(used < omega<K>());  // |h| has at most omega<K>() non-zeros.
+      out[used++] = coeff;
+    }
+    out[omega<K>() + poly] = used;  // Running total after this polynomial.
+  }
+}
+
+// FIPS 204, Algorithm 21 (`HintBitUnpack`). Returns one on success and zero if
+// |in| is malformed: end offsets must not decrease or exceed omega<K>(),
+// positions strictly increase per polynomial, and unused bytes are zero.
+template <int K>
+int hint_bit_unpack(vector<K> *h, const uint8_t in[omega<K>() + K]) {
+  static_assert(kDegree == 256,
+                "kDegree must be 256 for this write to be in bounds");
+  vector_zero(h);
+  int next = 0;
+  for (int poly = 0; poly < K; poly++) {
+    const int end = in[omega<K>() + poly];
+    if (end < next || end > omega<K>()) {
+      return 0;
+    }
+    for (int prev = -1; next < end; next++) {
+      const int pos = in[next];
+      if (pos <= prev) {
+        return 0;
+      }
+      prev = pos;
+      h->v[poly].c[pos] = 1;  // In bounds: |pos| is a byte and kDegree is 256.
+    }
+  }
+  for (; next < omega<K>(); next++) {
+    if (in[next] != 0) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+template <int K>
+struct public_key {
+  uint8_t rho[kRhoBytes];  // Seed that matrix_expand turns into the matrix.
+  vector<K> t1;            // Encoded at ten bits per coefficient.
+  // Pre-cached value: SHAKE-256 of the encoded public key.
+  uint8_t public_key_hash[kTrBytes];
+};
+
+template <int K, int L>
+struct private_key {
+  uint8_t rho[kRhoBytes];  // Seed that matrix_expand turns into the matrix.
+  uint8_t k[kKBytes];      // Hashed, with randomizer and mu, when signing.
+  uint8_t public_key_hash[kTrBytes];  // SHAKE-256 of the encoded public key.
+  vector<L> s1;  // s1 and s2 are sampled by vector_expand_short.
+  vector<K> s2;
+  vector<K> t0;  // Second output of vector_power2_round during key generation.
+};
+
+template <int K, int L>
+struct signature {
+  uint8_t c_tilde[2 * lambda_bytes<K>()];  // SHAKE-256 of mu and encoded w1.
+  vector<L> z;  // Encoded at 20 bits per coefficient.
+  vector<K> h;  // Hint vector; encoded by hint_bit_pack.
+};
+
+// FIPS 204, Algorithm 22 (`pkEncode`). Appends rho and then t1, packed at ten
+// bits per coefficient, to |out|. Returns one on success and zero if either
+// CBB call fails.
+template <int K>
+int mldsa_marshal_public_key(CBB *out, const struct public_key<K> *pub) {
+  uint8_t *t1_out;
+  if (!CBB_add_bytes(out, pub->rho, sizeof(pub->rho)) ||
+      !CBB_add_space(out, &t1_out, 320 * K)) {
+    return 0;
+  }
+
+  // 320 bytes per entry: 256 coefficients at ten bits each.
+  vector_encode(t1_out, &pub->t1, 10);
+  return 1;
+}
+
+// FIPS 204, Algorithm 23 (`pkDecode`). Parses rho and t1 from |in|, which must
+// hold nothing else, and caches the SHAKE-256 hash of the whole input in |pub|.
+// Returns one on success and zero if |in| has the wrong length.
+template <int K>
+int mldsa_parse_public_key(struct public_key<K> *pub, CBS *in) {
+  // Note the full encoding before |in| is consumed; it is hashed at the end.
+  const uint8_t *const encoded = CBS_data(in);
+  const size_t encoded_len = CBS_len(in);
+
+  CBS t1_bytes;
+  if (!CBS_copy_bytes(in, pub->rho, sizeof(pub->rho)) ||
+      !CBS_get_bytes(in, &t1_bytes, 320 * K) || CBS_len(in) != 0) {
+    return 0;
+  }
+  vector_decode_10(&pub->t1, CBS_data(&t1_bytes));
+
+  // Compute pre-cached values.
+  BORINGSSL_keccak(pub->public_key_hash, sizeof(pub->public_key_hash), encoded,
+                   encoded_len, boringssl_shake256);
+  return 1;
+}
+
+// FIPS 204, Algorithm 24 (`skEncode`). Appends rho, k, the public key hash,
+// s1, s2 and t0, in that order, to |out|. Returns one on success and zero if
+// any CBB call fails.
+template <int K, int L>
+int mldsa_marshal_private_key(CBB *out, const struct private_key<K, L> *priv) {
+  constexpr auto kEtaBits = plus_minus_eta_bitlen<K>();
+  constexpr auto kEta = eta<K>();
+  constexpr size_t kScalarBytes = (kDegree * kEtaBits + 7) / 8;
+
+  if (!CBB_add_bytes(out, priv->rho, sizeof(priv->rho)) ||
+      !CBB_add_bytes(out, priv->k, sizeof(priv->k)) ||
+      !CBB_add_bytes(out, priv->public_key_hash,
+                     sizeof(priv->public_key_hash))) {
+    return 0;
+  }
+
+  uint8_t *dest;
+  if (!CBB_add_space(out, &dest, kScalarBytes * L)) {
+    return 0;
+  }
+  vector_encode_signed(dest, &priv->s1, kEtaBits, kEta);
+
+  if (!CBB_add_space(out, &dest, kScalarBytes * K)) {
+    return 0;
+  }
+  vector_encode_signed(dest, &priv->s2, kEtaBits, kEta);
+
+  // t0 is always packed at 13 bits per coefficient.
+  if (!CBB_add_space(out, &dest, 416 * K)) {
+    return 0;
+  }
+  vector_encode_signed(dest, &priv->t0, 13, 1 << 12);
+  return 1;
+}
+
+// FIPS 204, Algorithm 25 (`skDecode`). Reads rho, k, the public key hash, s1,
+// s2 and t0, in that order, from |in|. Returns one on success and zero if |in|
+// is too short or a vector fails to decode. Trailing bytes are not checked here.
+template <int K, int L>
+int mldsa_parse_private_key(struct private_key<K, L> *priv, CBS *in) {
+  constexpr auto kEtaBits = plus_minus_eta_bitlen<K>();
+  constexpr size_t kScalarBytes = (kDegree * kEtaBits + 7) / 8;
+  CBS s1_bytes, s2_bytes, t0_bytes;
+  if (!CBS_copy_bytes(in, priv->rho, sizeof(priv->rho)) ||
+      !CBS_copy_bytes(in, priv->k, sizeof(priv->k)) ||
+      !CBS_copy_bytes(in, priv->public_key_hash,
+                      sizeof(priv->public_key_hash))) {
+    return 0;
+  }
+  if (!CBS_get_bytes(in, &s1_bytes, kScalarBytes * L) ||
+      !vector_decode_signed(&priv->s1, CBS_data(&s1_bytes), kEtaBits,
+                            eta<K>()) ||
+      !CBS_get_bytes(in, &s2_bytes, kScalarBytes * K) ||
+      !vector_decode_signed(&priv->s2, CBS_data(&s2_bytes), kEtaBits,
+                            eta<K>())) {
+    return 0;
+  }
+  // Note: Decoding 13 bits into (-2^12, 2^12] cannot fail.
+  return CBS_get_bytes(in, &t0_bytes, 416 * K) &&
+         vector_decode_signed(&priv->t0, CBS_data(&t0_bytes), 13, 1 << 12);
+}
+
+// FIPS 204, Algorithm 26 (`sigEncode`). Appends c_tilde, z at 20 bits per
+// coefficient, and the packed hint vector to |out|. Returns one on success and
+// zero if a CBB call fails.
+template <int K, int L>
+int mldsa_marshal_signature(CBB *out, const struct signature<K, L> *sign) {
+  uint8_t *z_out;
+  if (!CBB_add_bytes(out, sign->c_tilde, sizeof(sign->c_tilde)) ||
+      !CBB_add_space(out, &z_out, 640 * L)) {
+    return 0;
+  }
+  // 640 bytes per entry: 256 coefficients at 20 bits each.
+  vector_encode_signed(z_out, &sign->z, 20, 1 << 19);
+
+  uint8_t *hint_out;
+  if (!CBB_add_space(out, &hint_out, omega<K>() + K)) {
+    return 0;
+  }
+  hint_bit_pack(hint_out, &sign->h);
+
+  return 1;
+}
+
+// FIPS 204, Algorithm 27 (`sigDecode`). Reads c_tilde, z and the hint vector
+// from |in|. Returns one on success and zero if any step fails.
+template <int K, int L>
+int mldsa_parse_signature(struct signature<K, L> *sign, CBS *in) {
+  CBS z_bytes, hint_bytes;
+  if (!CBS_copy_bytes(in, sign->c_tilde, sizeof(sign->c_tilde)) ||
+      !CBS_get_bytes(in, &z_bytes, 640 * L) ||
+      // Note: Decoding 20 bits into (-2^19, 2^19] cannot fail.
+      !vector_decode_signed(&sign->z, CBS_data(&z_bytes), 20, 1 << 19) ||
+      !CBS_get_bytes(in, &hint_bytes, omega<K>() + K) ||
+      !hint_bit_unpack(&sign->h, CBS_data(&hint_bytes))) {
+    return 0;
+  }
+
+  return 1;
+}
+
+template <class T>
+struct DeleterFree {  // unique_ptr deleter that releases with OPENSSL_free.
+  void operator()(T *mem) { OPENSSL_free(mem); }
+};
+
+// FIPS 204, Algorithm 6 (`ML-DSA.KeyGen_internal`). Returns 1 on success and 0
+// if allocating scratch space or marshalling the public key fails.
+template <int K, int L>
+int mldsa_generate_key_external_entropy(
+    uint8_t out_encoded_public_key[public_key_bytes<K>()],
+    struct private_key<K, L> *priv,
+    const uint8_t entropy[BCM_MLDSA_SEED_BYTES]) {
+  // Intermediate values, allocated on the heap to allow use when there is a
+  // limited amount of stack.
+  struct values_st {
+    struct public_key<K> pub;
+    matrix<K, L> a_ntt;
+    vector<L> s1_ntt;
+    vector<K> t;
+  };
+  std::unique_ptr<values_st, DeleterFree<values_st>> values(
+      reinterpret_cast<struct values_st *>(OPENSSL_malloc(sizeof(values_st))));
+  if (values == NULL) {
+    return 0;
+  }
+
+  uint8_t augmented_entropy[BCM_MLDSA_SEED_BYTES + 2];
+  OPENSSL_memcpy(augmented_entropy, entropy, BCM_MLDSA_SEED_BYTES);
+  // The k and l parameters are appended to the seed.
+  augmented_entropy[BCM_MLDSA_SEED_BYTES] = K;
+  augmented_entropy[BCM_MLDSA_SEED_BYTES + 1] = L;
+  uint8_t expanded_seed[kRhoBytes + kSigmaBytes + kKBytes];  // rho, sigma, k.
+  BORINGSSL_keccak(expanded_seed, sizeof(expanded_seed), augmented_entropy,
+                   sizeof(augmented_entropy), boringssl_shake256);
+  const uint8_t *const rho = expanded_seed;
+  const uint8_t *const sigma = expanded_seed + kRhoBytes;
+  const uint8_t *const k = expanded_seed + kRhoBytes + kSigmaBytes;
+  // rho is public.
+  CONSTTIME_DECLASSIFY(rho, kRhoBytes);
+  OPENSSL_memcpy(values->pub.rho, rho, sizeof(values->pub.rho));
+  OPENSSL_memcpy(priv->rho, rho, sizeof(priv->rho));
+  OPENSSL_memcpy(priv->k, k, sizeof(priv->k));
+
+  matrix_expand(&values->a_ntt, rho);
+  vector_expand_short(&priv->s1, &priv->s2, sigma);
+  // Compute t = NTT^-1(A * NTT(s1)) + s2, where A is the matrix from rho.
+  OPENSSL_memcpy(&values->s1_ntt, &priv->s1, sizeof(values->s1_ntt));
+  vector_ntt(&values->s1_ntt);
+
+  matrix_mult(&values->t, &values->a_ntt, &values->s1_ntt);
+  vector_inverse_ntt(&values->t);
+  vector_add(&values->t, &values->t, &priv->s2);
+
+  vector_power2_round(&values->pub.t1, &priv->t0, &values->t);
+  // t1 is public.
+  CONSTTIME_DECLASSIFY(&values->pub.t1, sizeof(values->pub.t1));
+
+  CBB cbb;
+  CBB_init_fixed(&cbb, out_encoded_public_key, public_key_bytes<K>());
+  if (!mldsa_marshal_public_key(&cbb, &values->pub)) {
+    return 0;
+  }
+  assert(CBB_len(&cbb) == public_key_bytes<K>());
+  // Store the SHAKE-256 hash of the encoded public key in the private key.
+  BORINGSSL_keccak(priv->public_key_hash, sizeof(priv->public_key_hash),
+                   out_encoded_public_key, public_key_bytes<K>(),
+                   boringssl_shake256);
+
+  return 1;
+}
+
+// Derives the public key matching |priv| into |pub|. Returns 1 on success and
+// 0 if allocation fails.
+template <int K, int L>
+int mldsa_public_from_private(struct public_key<K> *pub,
+                              const struct private_key<K, L> *priv) {
+  // Scratch space lives on the heap because the stack may be small.
+  struct scratch_st {
+    matrix<K, L> a_ntt;
+    vector<L> s1_ntt;
+    vector<K> t;
+    vector<K> t0;
+  };
+  std::unique_ptr<scratch_st, DeleterFree<scratch_st>> scratch(
+      static_cast<scratch_st *>(OPENSSL_malloc(sizeof(scratch_st))));
+  if (!scratch) {
+    return 0;
+  }
+
+  OPENSSL_memcpy(pub->rho, priv->rho, sizeof(pub->rho));
+  OPENSSL_memcpy(pub->public_key_hash, priv->public_key_hash,
+                 sizeof(pub->public_key_hash));
+
+  // t = NTT^-1(A * NTT(s1)) + s2, where A is the matrix expanded from rho.
+  matrix_expand(&scratch->a_ntt, priv->rho);
+  OPENSSL_memcpy(&scratch->s1_ntt, &priv->s1, sizeof(scratch->s1_ntt));
+  vector_ntt(&scratch->s1_ntt);
+  matrix_mult(&scratch->t, &scratch->a_ntt, &scratch->s1_ntt);
+  vector_inverse_ntt(&scratch->t);
+  vector_add(&scratch->t, &scratch->t, &priv->s2);
+
+  // Only t1 is kept; the t0 output lands in scratch space and is discarded.
+  vector_power2_round(&pub->t1, &scratch->t0, &scratch->t);
+  return 1;
+}
+
+// FIPS 204, Algorithm 7 (`ML-DSA.Sign_internal`). Returns 1 on success and 0
+// if allocating scratch space or marshalling the signature fails.
+template <int K, int L>
+int mldsa_sign_internal(
+    uint8_t out_encoded_signature[signature_bytes<K>()],
+    const struct private_key<K, L> *priv, const uint8_t *msg, size_t msg_len,
+    const uint8_t *context_prefix, size_t context_prefix_len,
+    const uint8_t *context, size_t context_len,
+    const uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES]) {
+  uint8_t mu[kMuBytes];  // SHAKE-256 of the public key hash, context and msg.
+  struct BORINGSSL_keccak_st keccak_ctx;
+  BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256);
+  BORINGSSL_keccak_absorb(&keccak_ctx, priv->public_key_hash,
+                          sizeof(priv->public_key_hash));
+  BORINGSSL_keccak_absorb(&keccak_ctx, context_prefix, context_prefix_len);
+  BORINGSSL_keccak_absorb(&keccak_ctx, context, context_len);
+  BORINGSSL_keccak_absorb(&keccak_ctx, msg, msg_len);
+  BORINGSSL_keccak_squeeze(&keccak_ctx, mu, kMuBytes);
+
+  uint8_t rho_prime[kRhoPrimeBytes];  // SHAKE-256 of k, randomizer and mu.
+  BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256);
+  BORINGSSL_keccak_absorb(&keccak_ctx, priv->k, sizeof(priv->k));
+  BORINGSSL_keccak_absorb(&keccak_ctx, randomizer,
+                          BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES);
+  BORINGSSL_keccak_absorb(&keccak_ctx, mu, kMuBytes);
+  BORINGSSL_keccak_squeeze(&keccak_ctx, rho_prime, kRhoPrimeBytes);
+
+  // Intermediate values, allocated on the heap to allow use when there is a
+  // limited amount of stack.
+  struct values_st {
+    struct signature<K, L> sign;
+    vector<L> s1_ntt;
+    vector<K> s2_ntt;
+    vector<K> t0_ntt;
+    matrix<K, L> a_ntt;
+    vector<L> y;
+    vector<K> w;
+    vector<K> w1;
+    vector<L> cs1;
+    vector<K> cs2;
+  };
+  std::unique_ptr<values_st, DeleterFree<values_st>> values(
+      reinterpret_cast<struct values_st *>(OPENSSL_malloc(sizeof(values_st))));
+  if (values == NULL) {
+    return 0;
+  }
+  OPENSSL_memcpy(&values->s1_ntt, &priv->s1, sizeof(values->s1_ntt));
+  vector_ntt(&values->s1_ntt);
+
+  OPENSSL_memcpy(&values->s2_ntt, &priv->s2, sizeof(values->s2_ntt));
+  vector_ntt(&values->s2_ntt);
+
+  OPENSSL_memcpy(&values->t0_ntt, &priv->t0, sizeof(values->t0_ntt));
+  vector_ntt(&values->t0_ntt);
+
+  matrix_expand(&values->a_ntt, priv->rho);
+
+  // kappa + L must not exceed 2**16 (see the assert in vector_expand_mask), but
+  // the probability of needing even 1000 iterations is vanishingly small.
+  for (size_t kappa = 0;; kappa += L) {
+    vector_expand_mask(&values->y, rho_prime, kappa);
+
+    vector<L> *y_ntt = &values->cs1;  // Borrows cs1 until cs1 is computed.
+    OPENSSL_memcpy(y_ntt, &values->y, sizeof(*y_ntt));
+    vector_ntt(y_ntt);
+
+    matrix_mult(&values->w, &values->a_ntt, y_ntt);
+    vector_inverse_ntt(&values->w);
+
+    vector_high_bits(&values->w1, &values->w);
+    uint8_t w1_encoded[128 * K];
+    w1_encode(w1_encoded, &values->w1);
+
+    BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256);
+    BORINGSSL_keccak_absorb(&keccak_ctx, mu, kMuBytes);
+    BORINGSSL_keccak_absorb(&keccak_ctx, w1_encoded, 128 * K);
+    BORINGSSL_keccak_squeeze(&keccak_ctx, values->sign.c_tilde,
+                             2 * lambda_bytes<K>());
+
+    scalar c_ntt;
+    scalar_sample_in_ball_vartime(&c_ntt, values->sign.c_tilde,
+                                  sizeof(values->sign.c_tilde), tau<K>());
+    scalar_ntt(&c_ntt);
+
+    vector_mult_scalar(&values->cs1, &values->s1_ntt, &c_ntt);
+    vector_inverse_ntt(&values->cs1);
+    vector_mult_scalar(&values->cs2, &values->s2_ntt, &c_ntt);
+    vector_inverse_ntt(&values->cs2);
+
+    vector_add(&values->sign.z, &values->y, &values->cs1);
+
+    vector<K> *r0 = &values->w1;  // w1 has already been encoded; reuse it.
+    vector_sub(r0, &values->w, &values->cs2);
+    vector_low_bits(r0, r0);
+
+    // Leaking the fact that a signature was rejected is fine as the next
+    // attempt at a signature will be (indistinguishable from) independent of
+    // this one. Note, however, that we additionally leak which of the two
+    // branches rejected the signature. Section 5.5 of
+    // https://pq-crystals.org/dilithium/data/dilithium-specification-round3.pdf
+    // describes this leak as OK. Note we leak less than what is described by
+    // the paper; we do not reveal which coefficient violated the bound, and
+    // we hide which of the |z_max| or |r0_max| bound failed. See also
+    // https://boringssl-review.googlesource.com/c/boringssl/+/67747/comment/2bbab0fa_d241d35a/
+    uint32_t z_max = vector_max(&values->sign.z);
+    uint32_t r0_max = vector_max_signed(r0);
+    if (constant_time_declassify_w(
+            constant_time_ge_w(z_max, gamma1<K>() - beta<K>()) |
+            constant_time_ge_w(r0_max, kGamma2 - beta<K>()))) {
+      continue;
+    }
+
+    vector<K> *ct0 = &values->w1;  // Overwrites r0, which has been checked.
+    vector_mult_scalar(ct0, &values->t0_ntt, &c_ntt);
+    vector_inverse_ntt(ct0);
+    vector_make_hint(&values->sign.h, ct0, &values->cs2, &values->w);
+
+    // See above.
+    uint32_t ct0_max = vector_max(ct0);
+    size_t h_ones = vector_count_ones(&values->sign.h);
+    if (constant_time_declassify_w(constant_time_ge_w(ct0_max, kGamma2) |
+                                   constant_time_lt_w(omega<K>(), h_ones))) {
+      continue;
+    }
+
+    // Although computed with the private key, the signature is public.
+    CONSTTIME_DECLASSIFY(values->sign.c_tilde, sizeof(values->sign.c_tilde));
+    CONSTTIME_DECLASSIFY(&values->sign.z, sizeof(values->sign.z));
+    CONSTTIME_DECLASSIFY(&values->sign.h, sizeof(values->sign.h));
+
+    CBB cbb;
+    CBB_init_fixed(&cbb, out_encoded_signature, signature_bytes<K>());
+    if (!mldsa_marshal_signature(&cbb, &values->sign)) {
+      return 0;
+    }
+
+    BSSL_CHECK(CBB_len(&cbb) == signature_bytes<K>());
+    return 1;
+  }
+}
+
+// FIPS 204, Algorithm 8 (`ML-DSA.Verify_internal`). Returns 1 if valid, else 0.
+template <int K, int L>
+int mldsa_verify_internal(const struct public_key<K> *pub,
+                          const uint8_t encoded_signature[signature_bytes<K>()],
+                          const uint8_t *msg, size_t msg_len,
+                          const uint8_t *context_prefix,
+                          size_t context_prefix_len, const uint8_t *context,
+                          size_t context_len) {
+  // Intermediate values, allocated on the heap to allow use when there is a
+  // limited amount of stack.
+  struct values_st {
+    struct signature<K, L> sign;
+    matrix<K, L> a_ntt;
+    vector<L> z_ntt;
+    vector<K> az_ntt;
+    vector<K> ct1_ntt;
+  };
+  std::unique_ptr<values_st, DeleterFree<values_st>> values(
+      reinterpret_cast<struct values_st *>(OPENSSL_malloc(sizeof(values_st))));
+  if (values == NULL) {
+    return 0;
+  }
+
+  CBS cbs;
+  CBS_init(&cbs, encoded_signature, signature_bytes<K>());
+  if (!mldsa_parse_signature(&values->sign, &cbs)) {
+    return 0;
+  }
+
+  matrix_expand(&values->a_ntt, pub->rho);
+
+  uint8_t mu[kMuBytes];  // SHAKE-256 of the public key hash, context and msg.
+  struct BORINGSSL_keccak_st keccak_ctx;
+  BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256);
+  BORINGSSL_keccak_absorb(&keccak_ctx, pub->public_key_hash,
+                          sizeof(pub->public_key_hash));
+  BORINGSSL_keccak_absorb(&keccak_ctx, context_prefix, context_prefix_len);
+  BORINGSSL_keccak_absorb(&keccak_ctx, context, context_len);
+  BORINGSSL_keccak_absorb(&keccak_ctx, msg, msg_len);
+  BORINGSSL_keccak_squeeze(&keccak_ctx, mu, kMuBytes);
+
+  scalar c_ntt;
+  scalar_sample_in_ball_vartime(&c_ntt, values->sign.c_tilde,
+                                sizeof(values->sign.c_tilde), tau<K>());
+  scalar_ntt(&c_ntt);
+
+  OPENSSL_memcpy(&values->z_ntt, &values->sign.z, sizeof(values->z_ntt));
+  vector_ntt(&values->z_ntt);
+
+  matrix_mult(&values->az_ntt, &values->a_ntt, &values->z_ntt);
+
+  vector_scale_power2_round(&values->ct1_ntt, &pub->t1);
+  vector_ntt(&values->ct1_ntt);
+
+  vector_mult_scalar(&values->ct1_ntt, &values->ct1_ntt, &c_ntt);
+
+  vector<K> *const w1 = &values->az_ntt;  // Computed in place over az_ntt.
+  vector_sub(w1, &values->az_ntt, &values->ct1_ntt);
+  vector_inverse_ntt(w1);
+
+  vector_use_hint_vartime(w1, &values->sign.h, w1);
+  uint8_t w1_encoded[128 * K];
+  w1_encode(w1_encoded, w1);
+
+  uint8_t c_tilde[2 * lambda_bytes<K>()];
+  BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256);
+  BORINGSSL_keccak_absorb(&keccak_ctx, mu, kMuBytes);
+  BORINGSSL_keccak_absorb(&keccak_ctx, w1_encoded, 128 * K);
+  BORINGSSL_keccak_squeeze(&keccak_ctx, c_tilde, 2 * lambda_bytes<K>());
+  // Valid only if z is within its bound and the recomputed c_tilde matches.
+  uint32_t z_max = vector_max(&values->sign.z);
+  return z_max < static_cast<uint32_t>(gamma1<K>() - beta<K>()) &&
+         OPENSSL_memcmp(c_tilde, values->sign.c_tilde, 2 * lambda_bytes<K>()) ==
+             0;
+}
+
+struct private_key<6, 5> *private_key_from_external_65(
+    const struct BCM_mldsa65_private_key *external) {
+  static_assert(sizeof(struct BCM_mldsa65_private_key) ==
+                    sizeof(struct private_key<6, 5>),
+                "MLDSA65 private key size incorrect");
+  static_assert(alignof(struct BCM_mldsa65_private_key) ==
+                    alignof(struct private_key<6, 5>),
+                "MLDSA65 private key align incorrect");
+  return (struct private_key<6, 5> *)external;  // NOTE: cast drops const.
+}
+
+struct public_key<6> *
+public_key_from_external_65(const struct BCM_mldsa65_public_key *external) {
+  static_assert(sizeof(struct BCM_mldsa65_public_key) ==
+                    sizeof(struct public_key<6>),
+                "MLDSA65 public key size incorrect");
+  static_assert(alignof(struct BCM_mldsa65_public_key) ==
+                    alignof(struct public_key<6>),
+                "MLDSA65 public key align incorrect");
+  return (struct public_key<6> *)external;  // NOTE: cast drops const.
+}
+
+}  // namespace
+}  // namespace mldsa
+
+// ML-DSA-65 specific wrappers.
+
+bcm_status BCM_mldsa65_parse_public_key(
+    struct BCM_mldsa65_public_key *public_key, CBS *in) {
+  return bcm_as_approved_status(mldsa_parse_public_key(
+      mldsa::public_key_from_external_65(public_key), in));
+}
+
+bcm_status BCM_mldsa65_marshal_private_key(
+    CBB *out, const struct BCM_mldsa65_private_key *private_key) {
+  return bcm_as_approved_status(mldsa_marshal_private_key(
+      out, mldsa::private_key_from_external_65(private_key)));
+}
+
+bcm_status BCM_mldsa65_parse_private_key(
+    struct BCM_mldsa65_private_key *private_key, CBS *in) {
+  return bcm_as_approved_status(
+      mldsa_parse_private_key(mldsa::private_key_from_external_65(private_key),
+                              in) &&
+      CBS_len(in) == 0);  // Also requires |in| to be fully consumed.
+}
+
+// Calls |BCM_mldsa65_generate_key_external_entropy| with random bytes from
+// |BCM_rand_bytes|.
+bcm_status BCM_mldsa65_generate_key(
+    uint8_t out_encoded_public_key[BCM_MLDSA65_PUBLIC_KEY_BYTES],
+    uint8_t out_seed[BCM_MLDSA_SEED_BYTES],
+    struct BCM_mldsa65_private_key *out_private_key) {
+  BCM_rand_bytes(out_seed, BCM_MLDSA_SEED_BYTES);
+  return BCM_mldsa65_generate_key_external_entropy(out_encoded_public_key,
+                                                   out_private_key, out_seed);
+}
+
+bcm_status BCM_mldsa65_private_key_from_seed(
+    struct BCM_mldsa65_private_key *out_private_key,
+    const uint8_t seed[BCM_MLDSA_SEED_BYTES]) {
+  uint8_t public_key[BCM_MLDSA65_PUBLIC_KEY_BYTES];  // Output is discarded.
+  return BCM_mldsa65_generate_key_external_entropy(public_key, out_private_key,
+                                                   seed);
+}
+
+bcm_status BCM_mldsa65_generate_key_external_entropy(
+    uint8_t out_encoded_public_key[BCM_MLDSA65_PUBLIC_KEY_BYTES],
+    struct BCM_mldsa65_private_key *out_private_key,
+    const uint8_t entropy[BCM_MLDSA_SEED_BYTES]) {
+  return bcm_as_approved_status(mldsa_generate_key_external_entropy(
+      out_encoded_public_key,
+      mldsa::private_key_from_external_65(out_private_key), entropy));
+}
+
+bcm_status BCM_mldsa65_public_from_private(
+    struct BCM_mldsa65_public_key *out_public_key,
+    const struct BCM_mldsa65_private_key *private_key) {
+  return bcm_as_approved_status(mldsa_public_from_private(
+      mldsa::public_key_from_external_65(out_public_key),
+      mldsa::private_key_from_external_65(private_key)));
+}
+
+bcm_status BCM_mldsa65_sign_internal(
+    uint8_t out_encoded_signature[BCM_MLDSA65_SIGNATURE_BYTES],
+    const struct BCM_mldsa65_private_key *private_key, const uint8_t *msg,
+    size_t msg_len, const uint8_t *context_prefix, size_t context_prefix_len,
+    const uint8_t *context, size_t context_len,
+    const uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES]) {
+  return bcm_as_approved_status(mldsa_sign_internal(
+      out_encoded_signature, mldsa::private_key_from_external_65(private_key),
+      msg, msg_len, context_prefix, context_prefix_len, context, context_len,
+      randomizer));
+}
+
+// ML-DSA signature in randomized mode, filling the random bytes with
+// |BCM_rand_bytes|.
+bcm_status BCM_mldsa65_sign(
+    uint8_t out_encoded_signature[BCM_MLDSA65_SIGNATURE_BYTES],
+    const struct BCM_mldsa65_private_key *private_key, const uint8_t *msg,
+    size_t msg_len, const uint8_t *context, size_t context_len) {
+  BSSL_CHECK(context_len <= 255);  // Must fit the one-byte length below.
+  uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES];
+  BCM_rand_bytes(randomizer, sizeof(randomizer));
+
+  const uint8_t context_prefix[2] = {0, static_cast<uint8_t>(context_len)};
+  return BCM_mldsa65_sign_internal(
+      out_encoded_signature, private_key, msg, msg_len, context_prefix,
+      sizeof(context_prefix), context, context_len, randomizer);
+}
+
+// FIPS 204, Algorithm 3 (`ML-DSA.Verify`).
+bcm_status BCM_mldsa65_verify(
+    const struct BCM_mldsa65_public_key *public_key,
+    const uint8_t signature[BCM_MLDSA65_SIGNATURE_BYTES], const uint8_t *msg,
+    size_t msg_len, const uint8_t *context, size_t context_len) {
+  BSSL_CHECK(context_len <= 255);  // Must fit the one-byte length below.
+  const uint8_t context_prefix[2] = {0, static_cast<uint8_t>(context_len)};
+  return BCM_mldsa65_verify_internal(public_key, signature, msg, msg_len,
+                                     context_prefix, sizeof(context_prefix),
+                                     context, context_len);
+}
+
+bcm_status BCM_mldsa65_verify_internal(
+    const struct BCM_mldsa65_public_key *public_key,
+    const uint8_t encoded_signature[BCM_MLDSA65_SIGNATURE_BYTES],
+    const uint8_t *msg, size_t msg_len, const uint8_t *context_prefix,
+    size_t context_prefix_len, const uint8_t *context, size_t context_len) {
+  return bcm_as_approved_status(mldsa::mldsa_verify_internal<6, 5>(
+      mldsa::public_key_from_external_65(public_key), encoded_signature, msg,
+      msg_len, context_prefix, context_prefix_len, context, context_len));
+}
+
+bcm_status BCM_mldsa65_marshal_public_key(
+    CBB *out, const struct BCM_mldsa65_public_key *public_key) {
+  return bcm_as_approved_status(mldsa_marshal_public_key(
+      out, mldsa::public_key_from_external_65(public_key)));
+}
diff --git a/crypto/kyber/kyber.cc b/crypto/kyber/kyber.cc
index b58c0cb..4fcb6bf 100644
--- a/crypto/kyber/kyber.cc
+++ b/crypto/kyber/kyber.cc
@@ -21,8 +21,8 @@
 #include <openssl/bytestring.h>
 #include <openssl/rand.h>
 
+#include "../fipsmodule/keccak/internal.h"
 #include "../internal.h"
-#include "../keccak/internal.h"
 #include "./internal.h"
 
 
diff --git a/crypto/kyber/kyber_test.cc b/crypto/kyber/kyber_test.cc
index 84e210e..9fbd93d 100644
--- a/crypto/kyber/kyber_test.cc
+++ b/crypto/kyber/kyber_test.cc
@@ -23,9 +23,9 @@
 #define OPENSSL_UNSTABLE_EXPERIMENTAL_KYBER
 #include <openssl/experimental/kyber.h>
 
+#include "../fipsmodule/keccak/internal.h"
 #include "../test/file_test.h"
 #include "../test/test_util.h"
-#include "../keccak/internal.h"
 #include "./internal.h"
 
 
diff --git a/crypto/mldsa/internal.h b/crypto/mldsa/internal.h
deleted file mode 100644
index 95a19ec..0000000
--- a/crypto/mldsa/internal.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright 2024 The BoringSSL Authors
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-#ifndef OPENSSL_HEADER_CRYPTO_MLDSA_INTERNAL_H
-#define OPENSSL_HEADER_CRYPTO_MLDSA_INTERNAL_H
-
-#include <openssl/base.h>
-#include <openssl/mldsa.h>
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-
-// MLDSA_SIGNATURE_RANDOMIZER_BYTES is the number of bytes of uniformly
-// random entropy necessary to generate a signature in randomized mode.
-#define MLDSA_SIGNATURE_RANDOMIZER_BYTES 32
-
-
-// ML-DSA-65
-
-// MLDSA65_generate_key_external_entropy generates a public/private key pair
-// using the given seed, writes the encoded public key to
-// |out_encoded_public_key| and sets |out_private_key| to the private key.
-// It returns 1 on success and 0 on failure.
-OPENSSL_EXPORT int MLDSA65_generate_key_external_entropy(
-    uint8_t out_encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES],
-    struct MLDSA65_private_key *out_private_key,
-    const uint8_t entropy[MLDSA_SEED_BYTES]);
-
-// MLDSA65_sign_internal signs |msg| using |private_key| and writes the
-// signature to |out_encoded_signature|. The |context_prefix| and |context| are
-// prefixed to the message, in that order, before signing. The |randomizer|
-// value can be set to zero bytes in order to make a deterministic signature, or
-// else filled with entropy for the usual |MLDSA_sign| behavior. It returns 1 on
-// success and 0 on error.
-OPENSSL_EXPORT int MLDSA65_sign_internal(
-    uint8_t out_encoded_signature[MLDSA65_SIGNATURE_BYTES],
-    const struct MLDSA65_private_key *private_key, const uint8_t *msg,
-    size_t msg_len, const uint8_t *context_prefix, size_t context_prefix_len,
-    const uint8_t *context, size_t context_len,
-    const uint8_t randomizer[MLDSA_SIGNATURE_RANDOMIZER_BYTES]);
-
-// MLDSA65_verify_internal verifies that |encoded_signature| is a valid
-// signature of |msg| by |public_key|. The |context_prefix| and |context| are
-// prefixed to the message before verification, in that order. It returns 1 on
-// success and 0 on error.
-OPENSSL_EXPORT int MLDSA65_verify_internal(
-    const struct MLDSA65_public_key *public_key,
-    const uint8_t encoded_signature[MLDSA65_SIGNATURE_BYTES],
-    const uint8_t *msg, size_t msg_len, const uint8_t *context_prefix,
-    size_t context_prefix_len, const uint8_t *context, size_t context_len);
-
-// MLDSA65_marshal_private_key serializes |private_key| to |out| in the
-// NIST format for ML-DSA-65 private keys. It returns 1 on success or 0
-// on allocation error.
-OPENSSL_EXPORT int MLDSA65_marshal_private_key(
-    CBB *out, const struct MLDSA65_private_key *private_key);
-
-
-#if defined(__cplusplus)
-}  // extern C
-#endif
-
-#endif  // OPENSSL_HEADER_CRYPTO_MLDSA_INTERNAL_H
diff --git a/crypto/mldsa/mldsa.cc b/crypto/mldsa/mldsa.cc
index cdf199b..78c4bc6 100644
--- a/crypto/mldsa/mldsa.cc
+++ b/crypto/mldsa/mldsa.cc
@@ -14,1733 +14,45 @@
 
 #include <openssl/mldsa.h>
 
-#include <memory>
+#include "../fipsmodule/bcm_interface.h"
 
-#include <assert.h>
-#include <stdlib.h>
-
-#include <openssl/bytestring.h>
-#include <openssl/mem.h>
-#include <openssl/rand.h>
-
-#include "../internal.h"
-#include "../keccak/internal.h"
-#include "./internal.h"
-
-namespace {
-
-constexpr int kDegree = 256;
-constexpr int kRhoBytes = 32;
-constexpr int kSigmaBytes = 64;
-constexpr int kKBytes = 32;
-constexpr int kTrBytes = 64;
-constexpr int kMuBytes = 64;
-constexpr int kRhoPrimeBytes = 64;
-
-// 2^23 - 2^13 + 1
-constexpr uint32_t kPrime = 8380417;
-// Inverse of -kPrime modulo 2^32
-constexpr uint32_t kPrimeNegInverse = 4236238847;
-constexpr int kDroppedBits = 13;
-constexpr uint32_t kHalfPrime = (kPrime - 1) / 2;
-constexpr uint32_t kGamma2 = (kPrime - 1) / 32;
-// 256^-1 mod kPrime, in Montgomery form.
-constexpr uint32_t kInverseDegreeMontgomery = 41978;
-
-// Constants that vary depending on ML-DSA size.
-//
-// These are implemented as templates which take the K parameter to distinguish
-// the ML-DSA sizes. (At the time of writing, `if constexpr` was not available.)
-//
-// TODO(crbug.com/42290600): Switch this to `if constexpr` when C++17 is
-// available.
-
-template <int K>
-constexpr size_t public_key_bytes();
-
-template <>
-constexpr size_t public_key_bytes<6>() {
-  return MLDSA65_PUBLIC_KEY_BYTES;
-}
-
-template <int K>
-constexpr size_t signature_bytes();
-
-template <>
-constexpr size_t signature_bytes<6>() {
-  return MLDSA65_SIGNATURE_BYTES;
-}
-
-template <int K>
-constexpr int tau();
-
-template <>
-constexpr int tau<6>() {
-  return 49;
-}
-
-template <int K>
-constexpr int lambda_bytes();
-
-template <>
-constexpr int lambda_bytes<6>() {
-  return 192 / 8;
-}
-
-template <int K>
-constexpr int gamma1();
-
-template <>
-constexpr int gamma1<6>() {
-  return 1 << 19;
-}
-
-template <int K>
-constexpr int beta();
-
-template <>
-constexpr int beta<6>() {
-  return 196;
-}
-
-template <int K>
-constexpr int omega();
-
-template <>
-constexpr int omega<6>() {
-  return 55;
-}
-
-template <int K>
-constexpr int eta();
-
-template <>
-constexpr int eta<6>() {
-  return 4;
-}
-
-template <int K>
-constexpr int plus_minus_eta_bitlen();
-
-template <>
-constexpr int plus_minus_eta_bitlen<6>() {
-  return 4;
-}
-
-// Fundamental types.
-
-typedef struct scalar {
-  uint32_t c[kDegree];
-} scalar;
-
-template <int K>
-struct vector {
-  scalar v[K];
-};
-
-template <int K, int L>
-struct matrix {
-  scalar v[K][L];
-};
-
-/* Arithmetic */
-
-// This bit of Python will be referenced in some of the following comments:
-//
-// q = 8380417
-// # Inverse of -q modulo 2^32
-// q_neg_inverse = 4236238847
-// # 2^64 modulo q
-// montgomery_square = 2365951
-//
-// def bitreverse(i):
-//     ret = 0
-//     for n in range(8):
-//         bit = i & 1
-//         ret <<= 1
-//         ret |= bit
-//         i >>= 1
-//     return ret
-//
-// def montgomery_reduce(x):
-//     a = (x * q_neg_inverse) % 2**32
-//     b = x + a * q
-//     assert b & 0xFFFF_FFFF == 0
-//     c = b >> 32
-//     assert c < q
-//     return c
-//
-// def montgomery_transform(x):
-//     return montgomery_reduce(x * montgomery_square)
-
-// kNTTRootsMontgomery = [
-//   montgomery_transform(pow(1753, bitreverse(i), q)) for i in range(256)
-// ]
-static const uint32_t kNTTRootsMontgomery[256] = {
-    4193792, 25847,   5771523, 7861508, 237124,  7602457, 7504169, 466468,
-    1826347, 2353451, 8021166, 6288512, 3119733, 5495562, 3111497, 2680103,
-    2725464, 1024112, 7300517, 3585928, 7830929, 7260833, 2619752, 6271868,
-    6262231, 4520680, 6980856, 5102745, 1757237, 8360995, 4010497, 280005,
-    2706023, 95776,   3077325, 3530437, 6718724, 4788269, 5842901, 3915439,
-    4519302, 5336701, 3574422, 5512770, 3539968, 8079950, 2348700, 7841118,
-    6681150, 6736599, 3505694, 4558682, 3507263, 6239768, 6779997, 3699596,
-    811944,  531354,  954230,  3881043, 3900724, 5823537, 2071892, 5582638,
-    4450022, 6851714, 4702672, 5339162, 6927966, 3475950, 2176455, 6795196,
-    7122806, 1939314, 4296819, 7380215, 5190273, 5223087, 4747489, 126922,
-    3412210, 7396998, 2147896, 2715295, 5412772, 4686924, 7969390, 5903370,
-    7709315, 7151892, 8357436, 7072248, 7998430, 1349076, 1852771, 6949987,
-    5037034, 264944,  508951,  3097992, 44288,   7280319, 904516,  3958618,
-    4656075, 8371839, 1653064, 5130689, 2389356, 8169440, 759969,  7063561,
-    189548,  4827145, 3159746, 6529015, 5971092, 8202977, 1315589, 1341330,
-    1285669, 6795489, 7567685, 6940675, 5361315, 4499357, 4751448, 3839961,
-    2091667, 3407706, 2316500, 3817976, 5037939, 2244091, 5933984, 4817955,
-    266997,  2434439, 7144689, 3513181, 4860065, 4621053, 7183191, 5187039,
-    900702,  1859098, 909542,  819034,  495491,  6767243, 8337157, 7857917,
-    7725090, 5257975, 2031748, 3207046, 4823422, 7855319, 7611795, 4784579,
-    342297,  286988,  5942594, 4108315, 3437287, 5038140, 1735879, 203044,
-    2842341, 2691481, 5790267, 1265009, 4055324, 1247620, 2486353, 1595974,
-    4613401, 1250494, 2635921, 4832145, 5386378, 1869119, 1903435, 7329447,
-    7047359, 1237275, 5062207, 6950192, 7929317, 1312455, 3306115, 6417775,
-    7100756, 1917081, 5834105, 7005614, 1500165, 777191,  2235880, 3406031,
-    7838005, 5548557, 6709241, 6533464, 5796124, 4656147, 594136,  4603424,
-    6366809, 2432395, 2454455, 8215696, 1957272, 3369112, 185531,  7173032,
-    5196991, 162844,  1616392, 3014001, 810149,  1652634, 4686184, 6581310,
-    5341501, 3523897, 3866901, 269760,  2213111, 7404533, 1717735, 472078,
-    7953734, 1723600, 6577327, 1910376, 6712985, 7276084, 8119771, 4546524,
-    5441381, 6144432, 7959518, 6094090, 183443,  7403526, 1612842, 4834730,
-    7826001, 3919660, 8332111, 7018208, 3937738, 1400424, 7534263, 1976782};
-
-// Reduces x mod kPrime in constant time, where 0 <= x < 2*kPrime.
-static uint32_t reduce_once(uint32_t x) {
-  declassify_assert(x < 2 * kPrime);
-  // return x < kPrime ? x : x - kPrime;
-  return constant_time_select_int(constant_time_lt_w(x, kPrime), x, x - kPrime);
-}
-
-// Returns the absolute value in constant time.
-static uint32_t abs_signed(uint32_t x) {
-  // return is_positive(x) ? x : -x;
-  // Note: MSVC doesn't like applying the unary minus operator to unsigned types
-  // (warning C4146), so we write the negation as a bitwise not plus one
-  // (assuming two's complement representation).
-  return constant_time_select_int(constant_time_lt_w(x, 0x80000000), x, 0u - x);
-}
-
-// Returns the absolute value modulo kPrime.
-static uint32_t abs_mod_prime(uint32_t x) {
-  declassify_assert(x < kPrime);
-  // return x > kHalfPrime ? kPrime - x : x;
-  return constant_time_select_int(constant_time_lt_w(kHalfPrime, x), kPrime - x,
-                                  x);
-}
-
-// Returns the maximum of two values in constant time.
-static uint32_t maximum(uint32_t x, uint32_t y) {
-  // return x < y ? y : x;
-  return constant_time_select_int(constant_time_lt_w(x, y), y, x);
-}
-
-static uint32_t mod_sub(uint32_t a, uint32_t b) {
-  declassify_assert(a < kPrime);
-  declassify_assert(b < kPrime);
-  return reduce_once(kPrime + a - b);
-}
-
-static void scalar_add(scalar *out, const scalar *lhs, const scalar *rhs) {
-  for (int i = 0; i < kDegree; i++) {
-    out->c[i] = reduce_once(lhs->c[i] + rhs->c[i]);
-  }
-}
-
-static void scalar_sub(scalar *out, const scalar *lhs, const scalar *rhs) {
-  for (int i = 0; i < kDegree; i++) {
-    out->c[i] = mod_sub(lhs->c[i], rhs->c[i]);
-  }
-}
-
-static uint32_t reduce_montgomery(uint64_t x) {
-  declassify_assert(x <= ((uint64_t)kPrime << 32));
-  uint64_t a = (uint32_t)x * kPrimeNegInverse;
-  uint64_t b = x + a * kPrime;
-  declassify_assert((b & 0xffffffff) == 0);
-  uint32_t c = b >> 32;
-  return reduce_once(c);
-}
-
-// Multiply two scalars in the number theoretically transformed state.
-static void scalar_mult(scalar *out, const scalar *lhs, const scalar *rhs) {
-  for (int i = 0; i < kDegree; i++) {
-    out->c[i] = reduce_montgomery((uint64_t)lhs->c[i] * (uint64_t)rhs->c[i]);
-  }
-}
-
-// In place number theoretic transform of a given scalar.
-//
-// FIPS 204, Algorithm 41 (`NTT`).
-static void scalar_ntt(scalar *s) {
-  // Step: 1, 2, 4, 8, ..., 128
-  // Offset: 128, 64, 32, 16, ..., 1
-  int offset = kDegree;
-  for (int step = 1; step < kDegree; step <<= 1) {
-    offset >>= 1;
-    int k = 0;
-    for (int i = 0; i < step; i++) {
-      assert(k == 2 * offset * i);
-      const uint32_t step_root = kNTTRootsMontgomery[step + i];
-      for (int j = k; j < k + offset; j++) {
-        uint32_t even = s->c[j];
-        // |reduce_montgomery| works on values up to kPrime*R and R > 2*kPrime.
-        // |step_root| < kPrime because it's static data. |s->c[...]| is <
-        // kPrime by the invariants of that struct.
-        uint32_t odd =
-            reduce_montgomery((uint64_t)step_root * (uint64_t)s->c[j + offset]);
-        s->c[j] = reduce_once(odd + even);
-        s->c[j + offset] = mod_sub(even, odd);
-      }
-      k += 2 * offset;
-    }
-  }
-}
-
-// In place inverse number theoretic transform of a given scalar.
-//
-// FIPS 204, Algorithm 42 (`NTT^-1`).
-static void scalar_inverse_ntt(scalar *s) {
-  // Step: 128, 64, 32, 16, ..., 1
-  // Offset: 1, 2, 4, 8, ..., 128
-  int step = kDegree;
-  for (int offset = 1; offset < kDegree; offset <<= 1) {
-    step >>= 1;
-    int k = 0;
-    for (int i = 0; i < step; i++) {
-      assert(k == 2 * offset * i);
-      const uint32_t step_root =
-          kPrime - kNTTRootsMontgomery[step + (step - 1 - i)];
-      for (int j = k; j < k + offset; j++) {
-        uint32_t even = s->c[j];
-        uint32_t odd = s->c[j + offset];
-        s->c[j] = reduce_once(odd + even);
-
-        // |reduce_montgomery| works on values up to kPrime*R and R > 2*kPrime.
-        // kPrime + even < 2*kPrime because |even| < kPrime, by the invariants
-        // of that structure. Thus kPrime + even - odd < 2*kPrime because odd >=
-        // 0, because it's unsigned and less than kPrime. Lastly step_root <
-        // kPrime, because |kNTTRootsMontgomery| is static data.
-        s->c[j + offset] = reduce_montgomery((uint64_t)step_root *
-                                             (uint64_t)(kPrime + even - odd));
-      }
-      k += 2 * offset;
-    }
-  }
-  for (int i = 0; i < kDegree; i++) {
-    s->c[i] = reduce_montgomery((uint64_t)s->c[i] *
-                                (uint64_t)kInverseDegreeMontgomery);
-  }
-}
-
-template <int X>
-static void vector_zero(vector<X> *out) {
-  OPENSSL_memset(out, 0, sizeof(*out));
-}
-
-template <int X>
-static void vector_add(vector<X> *out, const vector<X> *lhs,
-                       const vector<X> *rhs) {
-  for (int i = 0; i < X; i++) {
-    scalar_add(&out->v[i], &lhs->v[i], &rhs->v[i]);
-  }
-}
-
-template <int X>
-static void vector_sub(vector<X> *out, const vector<X> *lhs,
-                       const vector<X> *rhs) {
-  for (int i = 0; i < X; i++) {
-    scalar_sub(&out->v[i], &lhs->v[i], &rhs->v[i]);
-  }
-}
-
-template <int X>
-static void vector_mult_scalar(vector<X> *out, const vector<X> *lhs,
-                               const scalar *rhs) {
-  for (int i = 0; i < X; i++) {
-    scalar_mult(&out->v[i], &lhs->v[i], rhs);
-  }
-}
-
-template <int X>
-static void vector_ntt(vector<X> *a) {
-  for (int i = 0; i < X; i++) {
-    scalar_ntt(&a->v[i]);
-  }
-}
-
-template <int X>
-static void vector_inverse_ntt(vector<X> *a) {
-  for (int i = 0; i < X; i++) {
-    scalar_inverse_ntt(&a->v[i]);
-  }
-}
-
-template <int K, int L>
-static void matrix_mult(vector<K> *out, const matrix<K, L> *m,
-                        const vector<L> *a) {
-  vector_zero(out);
-  for (int i = 0; i < K; i++) {
-    for (int j = 0; j < L; j++) {
-      scalar product;
-      scalar_mult(&product, &m->v[i][j], &a->v[j]);
-      scalar_add(&out->v[i], &out->v[i], &product);
-    }
-  }
-}
-
-/* Rounding & hints */
-
-// FIPS 204, Algorithm 35 (`Power2Round`).
-static void power2_round(uint32_t *r1, uint32_t *r0, uint32_t r) {
-  *r1 = r >> kDroppedBits;
-  *r0 = r - (*r1 << kDroppedBits);
-
-  uint32_t r0_adjusted = mod_sub(*r0, 1 << kDroppedBits);
-  uint32_t r1_adjusted = *r1 + 1;
-
-  // Mask is set iff r0 > 2^(dropped_bits - 1).
-  crypto_word_t mask =
-      constant_time_lt_w((uint32_t)(1 << (kDroppedBits - 1)), *r0);
-  // r0 = mask ? r0_adjusted : r0
-  *r0 = constant_time_select_int(mask, r0_adjusted, *r0);
-  // r1 = mask ? r1_adjusted : r1
-  *r1 = constant_time_select_int(mask, r1_adjusted, *r1);
-}
-
-// Scale back previously rounded value.
-static void scale_power2_round(uint32_t *out, uint32_t r1) {
-  // Pre-condition: 0 <= r1 <= 2^10 - 1
-  assert(r1 < (1u << 10));
-
-  *out = r1 << kDroppedBits;
-
-  // Post-condition: 0 <= out <= 2^23 - 2^13 = kPrime - 1
-  assert(*out < kPrime);
-}
-
-// FIPS 204, Algorithm 37 (`HighBits`).
-static uint32_t high_bits(uint32_t x) {
-  // Reference description (given 0 <= x < q):
-  //
-  // ```
-  // int32_t r0 = x mod+- (2 * kGamma2);
-  // if (x - r0 == q - 1) {
-  //   return 0;
-  // } else {
-  //   return (x - r0) / (2 * kGamma2);
-  // }
-  // ```
-  //
-  // Below is the formula taken from the reference implementation.
-  //
-  // Here, kGamma2 == 2^18 - 2^8
-  // This returns ((ceil(x / 2^7) * (2^10 + 1) + 2^21) / 2^22) mod 2^4
-  uint32_t r1 = (x + 127) >> 7;
-  r1 = (r1 * 1025 + (1 << 21)) >> 22;
-  r1 &= 15;
-  return r1;
-}
-
-// FIPS 204, Algorithm 36 (`Decompose`).
-static void decompose(uint32_t *r1, int32_t *r0, uint32_t r) {
-  *r1 = high_bits(r);
-
-  *r0 = r;
-  *r0 -= *r1 * 2 * (int32_t)kGamma2;
-  *r0 -= (((int32_t)kHalfPrime - *r0) >> 31) & (int32_t)kPrime;
-}
-
-// FIPS 204, Algorithm 38 (`LowBits`).
-static int32_t low_bits(uint32_t x) {
-  uint32_t r1;
-  int32_t r0;
-  decompose(&r1, &r0, x);
-  return r0;
-}
-
-// FIPS 204, Algorithm 39 (`MakeHint`).
-//
-// In the spec this takes two arguments, z and r, and is called with
-//   z = -ct0
-//   r = w - cs2 + ct0
-//
-// It then computes HighBits (algorithm 37) of z and z+r. But z+r is just w -
-// cs2, so this takes three arguments and saves an addition.
-static int32_t make_hint(uint32_t ct0, uint32_t cs2, uint32_t w) {
-  uint32_t r_plus_z = mod_sub(w, cs2);
-  uint32_t r = reduce_once(r_plus_z + ct0);
-  return high_bits(r) != high_bits(r_plus_z);
-}
-
-// FIPS 204, Algorithm 40 (`UseHint`).
-static uint32_t use_hint_vartime(uint32_t h, uint32_t r) {
-  uint32_t r1;
-  int32_t r0;
-  decompose(&r1, &r0, r);
-
-  if (h) {
-    if (r0 > 0) {
-      // m = 16, thus |mod m| in the spec turns into |& 15|.
-      return (r1 + 1) & 15;
-    } else {
-      return (r1 - 1) & 15;
-    }
-  }
-  return r1;
-}
-
-static void scalar_power2_round(scalar *s1, scalar *s0, const scalar *s) {
-  for (int i = 0; i < kDegree; i++) {
-    power2_round(&s1->c[i], &s0->c[i], s->c[i]);
-  }
-}
-
-static void scalar_scale_power2_round(scalar *out, const scalar *in) {
-  for (int i = 0; i < kDegree; i++) {
-    scale_power2_round(&out->c[i], in->c[i]);
-  }
-}
-
-static void scalar_high_bits(scalar *out, const scalar *in) {
-  for (int i = 0; i < kDegree; i++) {
-    out->c[i] = high_bits(in->c[i]);
-  }
-}
-
-static void scalar_low_bits(scalar *out, const scalar *in) {
-  for (int i = 0; i < kDegree; i++) {
-    out->c[i] = low_bits(in->c[i]);
-  }
-}
-
-static void scalar_max(uint32_t *max, const scalar *s) {
-  for (int i = 0; i < kDegree; i++) {
-    uint32_t abs = abs_mod_prime(s->c[i]);
-    *max = maximum(*max, abs);
-  }
-}
-
-static void scalar_max_signed(uint32_t *max, const scalar *s) {
-  for (int i = 0; i < kDegree; i++) {
-    uint32_t abs = abs_signed(s->c[i]);
-    *max = maximum(*max, abs);
-  }
-}
-
-static void scalar_make_hint(scalar *out, const scalar *ct0, const scalar *cs2,
-                             const scalar *w) {
-  for (int i = 0; i < kDegree; i++) {
-    out->c[i] = make_hint(ct0->c[i], cs2->c[i], w->c[i]);
-  }
-}
-
-static void scalar_use_hint_vartime(scalar *out, const scalar *h,
-                                    const scalar *r) {
-  for (int i = 0; i < kDegree; i++) {
-    out->c[i] = use_hint_vartime(h->c[i], r->c[i]);
-  }
-}
-
-template <int X>
-static void vector_power2_round(vector<X> *t1, vector<X> *t0,
-                                const vector<X> *t) {
-  for (int i = 0; i < X; i++) {
-    scalar_power2_round(&t1->v[i], &t0->v[i], &t->v[i]);
-  }
-}
-
-template <int X>
-static void vector_scale_power2_round(vector<X> *out, const vector<X> *in) {
-  for (int i = 0; i < X; i++) {
-    scalar_scale_power2_round(&out->v[i], &in->v[i]);
-  }
-}
-
-template <int X>
-static void vector_high_bits(vector<X> *out, const vector<X> *in) {
-  for (int i = 0; i < X; i++) {
-    scalar_high_bits(&out->v[i], &in->v[i]);
-  }
-}
-
-template <int X>
-static void vector_low_bits(vector<X> *out, const vector<X> *in) {
-  for (int i = 0; i < X; i++) {
-    scalar_low_bits(&out->v[i], &in->v[i]);
-  }
-}
-
-template <int X>
-static uint32_t vector_max(const vector<X> *a) {
-  uint32_t max = 0;
-  for (int i = 0; i < X; i++) {
-    scalar_max(&max, &a->v[i]);
-  }
-  return max;
-}
-
-template <int X>
-static uint32_t vector_max_signed(const vector<X> *a) {
-  uint32_t max = 0;
-  for (int i = 0; i < X; i++) {
-    scalar_max_signed(&max, &a->v[i]);
-  }
-  return max;
-}
-
-// The input vector contains only zeroes and ones.
-template <int X>
-static size_t vector_count_ones(const vector<X> *a) {
-  size_t count = 0;
-  for (int i = 0; i < X; i++) {
-    for (int j = 0; j < kDegree; j++) {
-      count += a->v[i].c[j];
-    }
-  }
-  return count;
-}
-
-template <int X>
-static void vector_make_hint(vector<X> *out, const vector<X> *ct0,
-                             const vector<X> *cs2, const vector<X> *w) {
-  for (int i = 0; i < X; i++) {
-    scalar_make_hint(&out->v[i], &ct0->v[i], &cs2->v[i], &w->v[i]);
-  }
-}
-
-template <int X>
-static void vector_use_hint_vartime(vector<X> *out, const vector<X> *h,
-                                    const vector<X> *r) {
-  for (int i = 0; i < X; i++) {
-    scalar_use_hint_vartime(&out->v[i], &h->v[i], &r->v[i]);
-  }
-}
-
-/* Bit packing */
-
-// FIPS 204, Algorithm 16 (`SimpleBitPack`). Specialized to bitlen(b) = 4.
-static void scalar_encode_4(uint8_t out[128], const scalar *s) {
-  // Every two elements lands on a byte boundary.
-  static_assert(kDegree % 2 == 0, "kDegree must be a multiple of 2");
-  for (int i = 0; i < kDegree / 2; i++) {
-    uint32_t a = s->c[2 * i];
-    uint32_t b = s->c[2 * i + 1];
-    declassify_assert(a < 16);
-    declassify_assert(b < 16);
-    out[i] = a | (b << 4);
-  }
-}
-
-// FIPS 204, Algorithm 16 (`SimpleBitPack`). Specialized to bitlen(b) = 10.
-static void scalar_encode_10(uint8_t out[320], const scalar *s) {
-  // Every four elements lands on a byte boundary.
-  static_assert(kDegree % 4 == 0, "kDegree must be a multiple of 4");
-  for (int i = 0; i < kDegree / 4; i++) {
-    uint32_t a = s->c[4 * i];
-    uint32_t b = s->c[4 * i + 1];
-    uint32_t c = s->c[4 * i + 2];
-    uint32_t d = s->c[4 * i + 3];
-    declassify_assert(a < 1024);
-    declassify_assert(b < 1024);
-    declassify_assert(c < 1024);
-    declassify_assert(d < 1024);
-    out[5 * i] = (uint8_t)a;
-    out[5 * i + 1] = (uint8_t)((a >> 8) | (b << 2));
-    out[5 * i + 2] = (uint8_t)((b >> 6) | (c << 4));
-    out[5 * i + 3] = (uint8_t)((c >> 4) | (d << 6));
-    out[5 * i + 4] = (uint8_t)(d >> 2);
-  }
-}
-
-// FIPS 204, Algorithm 17 (`BitPack`). Specialized to bitlen(b) = 4 and b = 4.
-static void scalar_encode_signed_4_4(uint8_t out[128], const scalar *s) {
-  // Every two elements lands on a byte boundary.
-  static_assert(kDegree % 2 == 0, "kDegree must be a multiple of 2");
-  for (int i = 0; i < kDegree / 2; i++) {
-    uint32_t a = mod_sub(4, s->c[2 * i]);
-    uint32_t b = mod_sub(4, s->c[2 * i + 1]);
-    declassify_assert(a < 16);
-    declassify_assert(b < 16);
-    out[i] = a | (b << 4);
-  }
-}
-
-// FIPS 204, Algorithm 17 (`BitPack`). Specialized to bitlen(b) = 13 and b =
-// 2^12.
-static void scalar_encode_signed_13_12(uint8_t out[416], const scalar *s) {
-  static const uint32_t kMax = 1u << 12;
-  // Every two elements lands on a byte boundary.
-  static_assert(kDegree % 8 == 0, "kDegree must be a multiple of 8");
-  for (int i = 0; i < kDegree / 8; i++) {
-    uint32_t a = mod_sub(kMax, s->c[8 * i]);
-    uint32_t b = mod_sub(kMax, s->c[8 * i + 1]);
-    uint32_t c = mod_sub(kMax, s->c[8 * i + 2]);
-    uint32_t d = mod_sub(kMax, s->c[8 * i + 3]);
-    uint32_t e = mod_sub(kMax, s->c[8 * i + 4]);
-    uint32_t f = mod_sub(kMax, s->c[8 * i + 5]);
-    uint32_t g = mod_sub(kMax, s->c[8 * i + 6]);
-    uint32_t h = mod_sub(kMax, s->c[8 * i + 7]);
-    declassify_assert(a < (1u << 13));
-    declassify_assert(b < (1u << 13));
-    declassify_assert(c < (1u << 13));
-    declassify_assert(d < (1u << 13));
-    declassify_assert(e < (1u << 13));
-    declassify_assert(f < (1u << 13));
-    declassify_assert(g < (1u << 13));
-    declassify_assert(h < (1u << 13));
-    a |= b << 13;
-    a |= c << 26;
-    c >>= 6;
-    c |= d << 7;
-    c |= e << 20;
-    e >>= 12;
-    e |= f << 1;
-    e |= g << 14;
-    e |= h << 27;
-    h >>= 5;
-    OPENSSL_memcpy(&out[13 * i], &a, sizeof(a));
-    OPENSSL_memcpy(&out[13 * i + 4], &c, sizeof(c));
-    OPENSSL_memcpy(&out[13 * i + 8], &e, sizeof(e));
-    OPENSSL_memcpy(&out[13 * i + 12], &h, 1);
-  }
-}
-
-// FIPS 204, Algorithm 17 (`BitPack`). Specialized to bitlen(b) = 20 and b =
-// 2^19.
-static void scalar_encode_signed_20_19(uint8_t out[640], const scalar *s) {
-  static const uint32_t kMax = 1u << 19;
-  // Every two elements lands on a byte boundary.
-  static_assert(kDegree % 4 == 0, "kDegree must be a multiple of 4");
-  for (int i = 0; i < kDegree / 4; i++) {
-    uint32_t a = mod_sub(kMax, s->c[4 * i]);
-    uint32_t b = mod_sub(kMax, s->c[4 * i + 1]);
-    uint32_t c = mod_sub(kMax, s->c[4 * i + 2]);
-    uint32_t d = mod_sub(kMax, s->c[4 * i + 3]);
-    declassify_assert(a < (1u << 20));
-    declassify_assert(b < (1u << 20));
-    declassify_assert(c < (1u << 20));
-    declassify_assert(d < (1u << 20));
-    a |= b << 20;
-    b >>= 12;
-    b |= c << 8;
-    b |= d << 28;
-    d >>= 4;
-    OPENSSL_memcpy(&out[10 * i], &a, sizeof(a));
-    OPENSSL_memcpy(&out[10 * i + 4], &b, sizeof(b));
-    OPENSSL_memcpy(&out[10 * i + 8], &d, 2);
-  }
-}
-
-// FIPS 204, Algorithm 17 (`BitPack`).
-static void scalar_encode_signed(uint8_t *out, const scalar *s, int bits,
-                                 uint32_t max) {
-  if (bits == 4) {
-    assert(max == 4);
-    scalar_encode_signed_4_4(out, s);
-  } else if (bits == 20) {
-    assert(max == 1u << 19);
-    scalar_encode_signed_20_19(out, s);
-  } else {
-    assert(bits == 13);
-    assert(max == 1u << 12);
-    scalar_encode_signed_13_12(out, s);
-  }
-}
-
-// FIPS 204, Algorithm 18 (`SimpleBitUnpack`). Specialized for bitlen(b) == 10.
-static void scalar_decode_10(scalar *out, const uint8_t in[320]) {
-  uint32_t v;
-  static_assert(kDegree % 4 == 0, "kDegree must be a multiple of 4");
-  for (int i = 0; i < kDegree / 4; i++) {
-    OPENSSL_memcpy(&v, &in[5 * i], sizeof(v));
-    out->c[4 * i] = v & 0x3ff;
-    out->c[4 * i + 1] = (v >> 10) & 0x3ff;
-    out->c[4 * i + 2] = (v >> 20) & 0x3ff;
-    out->c[4 * i + 3] = (v >> 30) | (((uint32_t)in[5 * i + 4]) << 2);
-  }
-}
-
-// FIPS 204, Algorithm 19 (`BitUnpack`). Specialized to bitlen(a+b) = 4 and b =
-// 4.
-static int scalar_decode_signed_4_4(scalar *out, const uint8_t in[128]) {
-  uint32_t v;
-  static_assert(kDegree % 8 == 0, "kDegree must be a multiple of 8");
-  for (int i = 0; i < kDegree / 8; i++) {
-    OPENSSL_memcpy(&v, &in[4 * i], sizeof(v));
-    // None of the nibbles may be >= 9. So if the MSB of any nibble is set, none
-    // of the other bits may be set. First, select all the MSBs.
-    const uint32_t msbs = v & 0x88888888u;
-    // For each nibble where the MSB is set, form a mask of all the other bits.
-    const uint32_t mask = (msbs >> 1) | (msbs >> 2) | (msbs >> 3);
-    // A nibble is only out of range in the case of invalid input, in which case
-    // it is okay to leak the value.
-    if (constant_time_declassify_int((mask & v) != 0)) {
-      return 0;
-    }
-
-    out->c[i * 8] = mod_sub(4, v & 15);
-    out->c[i * 8 + 1] = mod_sub(4, (v >> 4) & 15);
-    out->c[i * 8 + 2] = mod_sub(4, (v >> 8) & 15);
-    out->c[i * 8 + 3] = mod_sub(4, (v >> 12) & 15);
-    out->c[i * 8 + 4] = mod_sub(4, (v >> 16) & 15);
-    out->c[i * 8 + 5] = mod_sub(4, (v >> 20) & 15);
-    out->c[i * 8 + 6] = mod_sub(4, (v >> 24) & 15);
-    out->c[i * 8 + 7] = mod_sub(4, v >> 28);
-  }
-  return 1;
-}
-
-// FIPS 204, Algorithm 19 (`BitUnpack`). Specialized to bitlen(a+b) = 13 and b =
-// 2^12.
-static void scalar_decode_signed_13_12(scalar *out, const uint8_t in[416]) {
-  static const uint32_t kMax = 1u << 12;
-  static const uint32_t k13Bits = (1u << 13) - 1;
-  static const uint32_t k7Bits = (1u << 7) - 1;
-
-  uint32_t a, b, c;
-  uint8_t d;
-  static_assert(kDegree % 8 == 0, "kDegree must be a multiple of 8");
-  for (int i = 0; i < kDegree / 8; i++) {
-    OPENSSL_memcpy(&a, &in[13 * i], sizeof(a));
-    OPENSSL_memcpy(&b, &in[13 * i + 4], sizeof(b));
-    OPENSSL_memcpy(&c, &in[13 * i + 8], sizeof(c));
-    d = in[13 * i + 12];
-
-    // It's not possible for a 13-bit number to be out of range when the max is
-    // 2^12.
-    out->c[i * 8] = mod_sub(kMax, a & k13Bits);
-    out->c[i * 8 + 1] = mod_sub(kMax, (a >> 13) & k13Bits);
-    out->c[i * 8 + 2] = mod_sub(kMax, (a >> 26) | ((b & k7Bits) << 6));
-    out->c[i * 8 + 3] = mod_sub(kMax, (b >> 7) & k13Bits);
-    out->c[i * 8 + 4] = mod_sub(kMax, (b >> 20) | ((c & 1) << 12));
-    out->c[i * 8 + 5] = mod_sub(kMax, (c >> 1) & k13Bits);
-    out->c[i * 8 + 6] = mod_sub(kMax, (c >> 14) & k13Bits);
-    out->c[i * 8 + 7] = mod_sub(kMax, (c >> 27) | ((uint32_t)d) << 5);
-  }
-}
-
-// FIPS 204, Algorithm 19 (`BitUnpack`). Specialized to bitlen(a+b) = 20 and b =
-// 2^19.
-static void scalar_decode_signed_20_19(scalar *out, const uint8_t in[640]) {
-  static const uint32_t kMax = 1u << 19;
-  static const uint32_t k20Bits = (1u << 20) - 1;
-
-  uint32_t a, b;
-  uint16_t c;
-  static_assert(kDegree % 4 == 0, "kDegree must be a multiple of 4");
-  for (int i = 0; i < kDegree / 4; i++) {
-    OPENSSL_memcpy(&a, &in[10 * i], sizeof(a));
-    OPENSSL_memcpy(&b, &in[10 * i + 4], sizeof(b));
-    OPENSSL_memcpy(&c, &in[10 * i + 8], sizeof(c));
-
-    // It's not possible for a 20-bit number to be out of range when the max is
-    // 2^19.
-    out->c[i * 4] = mod_sub(kMax, a & k20Bits);
-    out->c[i * 4 + 1] = mod_sub(kMax, (a >> 20) | ((b & 0xff) << 12));
-    out->c[i * 4 + 2] = mod_sub(kMax, (b >> 8) & k20Bits);
-    out->c[i * 4 + 3] = mod_sub(kMax, (b >> 28) | ((uint32_t)c) << 4);
-  }
-}
-
-// FIPS 204, Algorithm 19 (`BitUnpack`).
-static int scalar_decode_signed(scalar *out, const uint8_t *in, int bits,
-                                uint32_t max) {
-  if (bits == 4) {
-    assert(max == 4);
-    return scalar_decode_signed_4_4(out, in);
-  } else if (bits == 13) {
-    assert(max == (1u << 12));
-    scalar_decode_signed_13_12(out, in);
-    return 1;
-  } else if (bits == 20) {
-    assert(max == (1u << 19));
-    scalar_decode_signed_20_19(out, in);
-    return 1;
-  } else {
-    abort();
-  }
-}
-
-/* Expansion functions */
-
-// FIPS 204, Algorithm 30 (`RejNTTPoly`).
-//
-// Rejection samples a Keccak stream to get uniformly distributed elements. This
-// is used for matrix expansion and only operates on public inputs.
-static void scalar_from_keccak_vartime(
-    scalar *out, const uint8_t derived_seed[kRhoBytes + 2]) {
-  struct BORINGSSL_keccak_st keccak_ctx;
-  BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake128);
-  BORINGSSL_keccak_absorb(&keccak_ctx, derived_seed, kRhoBytes + 2);
-  assert(keccak_ctx.squeeze_offset == 0);
-  assert(keccak_ctx.rate_bytes == 168);
-  static_assert(168 % 3 == 0, "block and coefficient boundaries do not align");
-
-  int done = 0;
-  while (done < kDegree) {
-    uint8_t block[168];
-    BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block));
-    for (size_t i = 0; i < sizeof(block) && done < kDegree; i += 3) {
-      // FIPS 204, Algorithm 14 (`CoeffFromThreeBytes`).
-      uint32_t value = (uint32_t)block[i] | ((uint32_t)block[i + 1] << 8) |
-                       (((uint32_t)block[i + 2] & 0x7f) << 16);
-      if (value < kPrime) {
-        out->c[done++] = value;
-      }
-    }
-  }
-}
-
-template <int ETA>
-static bool coefficient_from_nibble(uint32_t nibble, uint32_t *result);
-
-template <>
-bool coefficient_from_nibble<4>(uint32_t nibble, uint32_t *result) {
-  if (constant_time_declassify_int(nibble < 9)) {
-    *result = mod_sub(4, nibble);
-    return true;
-  }
-  return false;
-}
-
-// FIPS 204, Algorithm 31 (`RejBoundedPoly`).
-template <int ETA>
-static void scalar_uniform(scalar *out,
-                           const uint8_t derived_seed[kSigmaBytes + 2]) {
-  struct BORINGSSL_keccak_st keccak_ctx;
-  BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256);
-  BORINGSSL_keccak_absorb(&keccak_ctx, derived_seed, kSigmaBytes + 2);
-  assert(keccak_ctx.squeeze_offset == 0);
-  assert(keccak_ctx.rate_bytes == 136);
-
-  int done = 0;
-  while (done < kDegree) {
-    uint8_t block[136];
-    BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block));
-    for (size_t i = 0; i < sizeof(block) && done < kDegree; ++i) {
-      uint32_t t0 = block[i] & 0x0F;
-      uint32_t t1 = block[i] >> 4;
-      // FIPS 204, Algorithm 15 (`CoefFromHalfByte`). Although both the input
-      // and output here are secret, it is OK to leak when we rejected a byte.
-      // Individual bytes of the SHAKE-256 stream are (indistiguishable from)
-      // independent of each other and the original seed, so leaking information
-      // about the rejected bytes does not reveal the input or output.
-      uint32_t v;
-      if (coefficient_from_nibble<ETA>(t0, &v)) {
-        out->c[done++] = v;
-      }
-      if (done < kDegree && coefficient_from_nibble<ETA>(t1, &v)) {
-        out->c[done++] = v;
-      }
-    }
-  }
-}
-
-// FIPS 204, Algorithm 34 (`ExpandMask`), but just a single step.
-static void scalar_sample_mask(scalar *out,
-                               const uint8_t derived_seed[kRhoPrimeBytes + 2]) {
-  uint8_t buf[640];
-  BORINGSSL_keccak(buf, sizeof(buf), derived_seed, kRhoPrimeBytes + 2,
-                   boringssl_shake256);
-
-  scalar_decode_signed_20_19(out, buf);
-}
-
-// FIPS 204, Algorithm 29 (`SampleInBall`).
-static void scalar_sample_in_ball_vartime(scalar *out, const uint8_t *seed,
-                                          int len, int tau) {
-  struct BORINGSSL_keccak_st keccak_ctx;
-  BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256);
-  BORINGSSL_keccak_absorb(&keccak_ctx, seed, len);
-  assert(keccak_ctx.squeeze_offset == 0);
-  assert(keccak_ctx.rate_bytes == 136);
-
-  uint8_t block[136];
-  BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block));
-
-  uint64_t signs = CRYPTO_load_u64_le(block);
-  int offset = 8;
-  // SampleInBall implements a Fisher–Yates shuffle, which unavoidably leaks
-  // where the zeros are by memory access pattern. Although this leak happens
-  // before bad signatures are rejected, this is safe. See
-  // https://boringssl-review.googlesource.com/c/boringssl/+/67747/comment/8d8f01ac_70af3f21/
-  CONSTTIME_DECLASSIFY(block + offset, sizeof(block) - offset);
-
-  OPENSSL_memset(out, 0, sizeof(*out));
-  for (size_t i = kDegree - tau; i < kDegree; i++) {
-    size_t byte;
-    for (;;) {
-      if (offset == 136) {
-        BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block));
-        // See above.
-        CONSTTIME_DECLASSIFY(block, sizeof(block));
-        offset = 0;
-      }
-
-      byte = block[offset++];
-      if (byte <= i) {
-        break;
-      }
-    }
-
-    out->c[i] = out->c[byte];
-    out->c[byte] = mod_sub(1, 2 * (signs & 1));
-    signs >>= 1;
-  }
-}
-
-// FIPS 204, Algorithm 32 (`ExpandA`).
-template <int K, int L>
-static void matrix_expand(matrix<K, L> *out, const uint8_t rho[kRhoBytes]) {
-  static_assert(K <= 0x100, "K must fit in 8 bits");
-  static_assert(L <= 0x100, "L must fit in 8 bits");
-
-  uint8_t derived_seed[kRhoBytes + 2];
-  OPENSSL_memcpy(derived_seed, rho, kRhoBytes);
-  for (int i = 0; i < K; i++) {
-    for (int j = 0; j < L; j++) {
-      derived_seed[kRhoBytes + 1] = (uint8_t)i;
-      derived_seed[kRhoBytes] = (uint8_t)j;
-      scalar_from_keccak_vartime(&out->v[i][j], derived_seed);
-    }
-  }
-}
-
-// FIPS 204, Algorithm 33 (`ExpandS`).
-template <int K, int L>
-static void vector_expand_short(vector<L> *s1, vector<K> *s2,
-                                const uint8_t sigma[kSigmaBytes]) {
-  static_assert(K <= 0x100, "K must fit in 8 bits");
-  static_assert(L <= 0x100, "L must fit in 8 bits");
-  static_assert(K + L <= 0x100, "K+L must fit in 8 bits");
-
-  uint8_t derived_seed[kSigmaBytes + 2];
-  OPENSSL_memcpy(derived_seed, sigma, kSigmaBytes);
-  derived_seed[kSigmaBytes] = 0;
-  derived_seed[kSigmaBytes + 1] = 0;
-  for (int i = 0; i < L; i++) {
-    scalar_uniform<eta<K>()>(&s1->v[i], derived_seed);
-    ++derived_seed[kSigmaBytes];
-  }
-  for (int i = 0; i < K; i++) {
-    scalar_uniform<eta<K>()>(&s2->v[i], derived_seed);
-    ++derived_seed[kSigmaBytes];
-  }
-}
-
-// FIPS 204, Algorithm 34 (`ExpandMask`).
-template <int L>
-static void vector_expand_mask(vector<L> *out,
-                               const uint8_t seed[kRhoPrimeBytes],
-                               size_t kappa) {
-  assert(kappa + L <= 0x10000);
-
-  uint8_t derived_seed[kRhoPrimeBytes + 2];
-  OPENSSL_memcpy(derived_seed, seed, kRhoPrimeBytes);
-  for (int i = 0; i < L; i++) {
-    size_t index = kappa + i;
-    derived_seed[kRhoPrimeBytes] = index & 0xFF;
-    derived_seed[kRhoPrimeBytes + 1] = (index >> 8) & 0xFF;
-    scalar_sample_mask(&out->v[i], derived_seed);
-  }
-}
-
-/* Encoding */
-
-// FIPS 204, Algorithm 16 (`SimpleBitPack`).
-//
-// Encodes an entire vector into 32*K*|bits| bytes. Note that since 256
-// (kDegree) is divisible by 8, the individual vector entries will always fill a
-// whole number of bytes, so we do not need to worry about bit packing here.
-template <int K>
-static void vector_encode(uint8_t *out, const vector<K> *a, int bits) {
-  if (bits == 4) {
-    for (int i = 0; i < K; i++) {
-      scalar_encode_4(out + i * bits * kDegree / 8, &a->v[i]);
-    }
-  } else {
-    assert(bits == 10);
-    for (int i = 0; i < K; i++) {
-      scalar_encode_10(out + i * bits * kDegree / 8, &a->v[i]);
-    }
-  }
-}
-
-// FIPS 204, Algorithm 18 (`SimpleBitUnpack`).
-template <int K>
-static void vector_decode_10(vector<K> *out, const uint8_t *in) {
-  for (int i = 0; i < K; i++) {
-    scalar_decode_10(&out->v[i], in + i * 10 * kDegree / 8);
-  }
-}
-
-// FIPS 204, Algorithm 17 (`BitPack`).
-//
-// Encodes an entire vector into 32*L*|bits| bytes. Note that since 256
-// (kDegree) is divisible by 8, the individual vector entries will always fill a
-// whole number of bytes, so we do not need to worry about bit packing here.
-template <int X>
-static void vector_encode_signed(uint8_t *out, const vector<X> *a, int bits,
-                                 uint32_t max) {
-  for (int i = 0; i < X; i++) {
-    scalar_encode_signed(out + i * bits * kDegree / 8, &a->v[i], bits, max);
-  }
-}
-
-template <int X>
-static int vector_decode_signed(vector<X> *out, const uint8_t *in, int bits,
-                                uint32_t max) {
-  for (int i = 0; i < X; i++) {
-    if (!scalar_decode_signed(&out->v[i], in + i * bits * kDegree / 8, bits,
-                              max)) {
-      return 0;
-    }
-  }
-  return 1;
-}
-
-// FIPS 204, Algorithm 28 (`w1Encode`).
-template <int K>
-static void w1_encode(uint8_t out[128 * K], const vector<K> *w1) {
-  vector_encode(out, w1, 4);
-}
-
-// FIPS 204, Algorithm 20 (`HintBitPack`).
-template <int K>
-static void hint_bit_pack(uint8_t out[omega<K>() + K], const vector<K> *h) {
-  OPENSSL_memset(out, 0, omega<K>() + K);
-  int index = 0;
-  for (int i = 0; i < K; i++) {
-    for (int j = 0; j < kDegree; j++) {
-      if (h->v[i].c[j]) {
-        // h must have at most omega<K>() non-zero coefficients.
-        BSSL_CHECK(index < omega<K>());
-        out[index++] = j;
-      }
-    }
-    out[omega<K>() + i] = index;
-  }
-}
-
-// FIPS 204, Algorithm 21 (`HintBitUnpack`).
-template <int K>
-static int hint_bit_unpack(vector<K> *h, const uint8_t in[omega<K>() + K]) {
-  vector_zero(h);
-  int index = 0;
-  for (int i = 0; i < K; i++) {
-    const int limit = in[omega<K>() + i];
-    if (limit < index || limit > omega<K>()) {
-      return 0;
-    }
-
-    int last = -1;
-    while (index < limit) {
-      int byte = in[index++];
-      if (last >= 0 && byte <= last) {
-        return 0;
-      }
-      last = byte;
-      static_assert(kDegree == 256,
-                    "kDegree must be 256 for this write to be in bounds");
-      h->v[i].c[byte] = 1;
-    }
-  }
-  for (; index < omega<K>(); index++) {
-    if (in[index] != 0) {
-      return 0;
-    }
-  }
-  return 1;
-}
-
-template <int K>
-struct public_key {
-  uint8_t rho[kRhoBytes];
-  vector<K> t1;
-  // Pre-cached value(s).
-  uint8_t public_key_hash[kTrBytes];
-};
-
-template <int K, int L>
-struct private_key {
-  uint8_t rho[kRhoBytes];
-  uint8_t k[kKBytes];
-  uint8_t public_key_hash[kTrBytes];
-  vector<L> s1;
-  vector<K> s2;
-  vector<K> t0;
-};
-
-template <int K, int L>
-struct signature {
-  uint8_t c_tilde[2 * lambda_bytes<K>()];
-  vector<L> z;
-  vector<K> h;
-};
-
-// FIPS 204, Algorithm 22 (`pkEncode`).
-template <int K>
-static int mldsa_marshal_public_key(CBB *out, const struct public_key<K> *pub) {
-  if (!CBB_add_bytes(out, pub->rho, sizeof(pub->rho))) {
-    return 0;
-  }
-
-  uint8_t *vectork_output;
-  if (!CBB_add_space(out, &vectork_output, 320 * K)) {
-    return 0;
-  }
-  vector_encode(vectork_output, &pub->t1, 10);
-
-  return 1;
-}
-
-// FIPS 204, Algorithm 23 (`pkDecode`).
-template <int K>
-static int mldsa_parse_public_key(struct public_key<K> *pub, CBS *in) {
-  const CBS orig_in = *in;
-
-  if (!CBS_copy_bytes(in, pub->rho, sizeof(pub->rho))) {
-    return 0;
-  }
-
-  CBS t1_bytes;
-  if (!CBS_get_bytes(in, &t1_bytes, 320 * K) || CBS_len(in) != 0) {
-    return 0;
-  }
-  vector_decode_10(&pub->t1, CBS_data(&t1_bytes));
-
-  // Compute pre-cached values.
-  BORINGSSL_keccak(pub->public_key_hash, sizeof(pub->public_key_hash),
-                   CBS_data(&orig_in), CBS_len(&orig_in), boringssl_shake256);
-
-  return 1;
-}
-
-// FIPS 204, Algorithm 24 (`skEncode`).
-template <int K, int L>
-static int mldsa_marshal_private_key(CBB *out,
-                                     const struct private_key<K, L> *priv) {
-  if (!CBB_add_bytes(out, priv->rho, sizeof(priv->rho)) ||
-      !CBB_add_bytes(out, priv->k, sizeof(priv->k)) ||
-      !CBB_add_bytes(out, priv->public_key_hash,
-                     sizeof(priv->public_key_hash))) {
-    return 0;
-  }
-
-  constexpr size_t scalar_bytes =
-      (kDegree * plus_minus_eta_bitlen<K>() + 7) / 8;
-  uint8_t *vectorl_output;
-  if (!CBB_add_space(out, &vectorl_output, scalar_bytes * L)) {
-    return 0;
-  }
-  vector_encode_signed(vectorl_output, &priv->s1, plus_minus_eta_bitlen<K>(),
-                       eta<K>());
-
-  uint8_t *s2_output;
-  if (!CBB_add_space(out, &s2_output, scalar_bytes * K)) {
-    return 0;
-  }
-  vector_encode_signed(s2_output, &priv->s2, plus_minus_eta_bitlen<K>(),
-                       eta<K>());
-
-  uint8_t *t0_output;
-  if (!CBB_add_space(out, &t0_output, 416 * K)) {
-    return 0;
-  }
-  vector_encode_signed(t0_output, &priv->t0, 13, 1 << 12);
-
-  return 1;
-}
-
-// FIPS 204, Algorithm 25 (`skDecode`).
-template <int K, int L>
-static int mldsa_parse_private_key(struct private_key<K, L> *priv, CBS *in) {
-  CBS s1_bytes;
-  CBS s2_bytes;
-  CBS t0_bytes;
-  constexpr size_t scalar_bytes =
-      (kDegree * plus_minus_eta_bitlen<K>() + 7) / 8;
-  if (!CBS_copy_bytes(in, priv->rho, sizeof(priv->rho)) ||
-      !CBS_copy_bytes(in, priv->k, sizeof(priv->k)) ||
-      !CBS_copy_bytes(in, priv->public_key_hash,
-                      sizeof(priv->public_key_hash)) ||
-      !CBS_get_bytes(in, &s1_bytes, scalar_bytes * L) ||
-      !vector_decode_signed(&priv->s1, CBS_data(&s1_bytes),
-                            plus_minus_eta_bitlen<K>(), eta<K>()) ||
-      !CBS_get_bytes(in, &s2_bytes, scalar_bytes * K) ||
-      !vector_decode_signed(&priv->s2, CBS_data(&s2_bytes),
-                            plus_minus_eta_bitlen<K>(), eta<K>()) ||
-      !CBS_get_bytes(in, &t0_bytes, 416 * K) ||
-      // Note: Decoding 13 bits into (-2^12, 2^12] cannot fail.
-      !vector_decode_signed(&priv->t0, CBS_data(&t0_bytes), 13, 1 << 12)) {
-    return 0;
-  }
-
-  return 1;
-}
-
-// FIPS 204, Algorithm 26 (`sigEncode`).
-template <int K, int L>
-static int mldsa_marshal_signature(CBB *out,
-                                   const struct signature<K, L> *sign) {
-  if (!CBB_add_bytes(out, sign->c_tilde, sizeof(sign->c_tilde))) {
-    return 0;
-  }
-
-  uint8_t *vectorl_output;
-  if (!CBB_add_space(out, &vectorl_output, 640 * L)) {
-    return 0;
-  }
-  vector_encode_signed(vectorl_output, &sign->z, 20, 1 << 19);
-
-  uint8_t *hint_output;
-  if (!CBB_add_space(out, &hint_output, omega<K>() + K)) {
-    return 0;
-  }
-  hint_bit_pack(hint_output, &sign->h);
-
-  return 1;
-}
-
-// FIPS 204, Algorithm 27 (`sigDecode`).
-template <int K, int L>
-static int mldsa_parse_signature(struct signature<K, L> *sign, CBS *in) {
-  CBS z_bytes;
-  CBS hint_bytes;
-  if (!CBS_copy_bytes(in, sign->c_tilde, sizeof(sign->c_tilde)) ||
-      !CBS_get_bytes(in, &z_bytes, 640 * L) ||
-      // Note: Decoding 20 bits into (-2^19, 2^19] cannot fail.
-      !vector_decode_signed(&sign->z, CBS_data(&z_bytes), 20, 1 << 19) ||
-      !CBS_get_bytes(in, &hint_bytes, omega<K>() + K) ||
-      !hint_bit_unpack(&sign->h, CBS_data(&hint_bytes))) {
-    return 0;
-  };
-
-  return 1;
-}
-
-template <typename T>
-struct DeleterFree {
-  void operator()(T *ptr) { OPENSSL_free(ptr); }
-};
-
-// FIPS 204, Algorithm 6 (`ML-DSA.KeyGen_internal`). Returns 1 on success and 0
-// on failure.
-template <int K, int L>
-static int mldsa_generate_key_external_entropy(
-    uint8_t out_encoded_public_key[public_key_bytes<K>()],
-    struct private_key<K, L> *priv, const uint8_t entropy[MLDSA_SEED_BYTES]) {
-  // Intermediate values, allocated on the heap to allow use when there is a
-  // limited amount of stack.
-  struct values_st {
-    struct public_key<K> pub;
-    matrix<K, L> a_ntt;
-    vector<L> s1_ntt;
-    vector<K> t;
-  };
-  std::unique_ptr<values_st, DeleterFree<values_st>> values(
-      reinterpret_cast<struct values_st *>(OPENSSL_malloc(sizeof(values_st))));
-  if (values == NULL) {
-    return 0;
-  }
-
-  uint8_t augmented_entropy[MLDSA_SEED_BYTES + 2];
-  OPENSSL_memcpy(augmented_entropy, entropy, MLDSA_SEED_BYTES);
-  // The k and l parameters are appended to the seed.
-  augmented_entropy[MLDSA_SEED_BYTES] = K;
-  augmented_entropy[MLDSA_SEED_BYTES + 1] = L;
-  uint8_t expanded_seed[kRhoBytes + kSigmaBytes + kKBytes];
-  BORINGSSL_keccak(expanded_seed, sizeof(expanded_seed), augmented_entropy,
-                   sizeof(augmented_entropy), boringssl_shake256);
-  const uint8_t *const rho = expanded_seed;
-  const uint8_t *const sigma = expanded_seed + kRhoBytes;
-  const uint8_t *const k = expanded_seed + kRhoBytes + kSigmaBytes;
-  // rho is public.
-  CONSTTIME_DECLASSIFY(rho, kRhoBytes);
-  OPENSSL_memcpy(values->pub.rho, rho, sizeof(values->pub.rho));
-  OPENSSL_memcpy(priv->rho, rho, sizeof(priv->rho));
-  OPENSSL_memcpy(priv->k, k, sizeof(priv->k));
-
-  matrix_expand(&values->a_ntt, rho);
-  vector_expand_short(&priv->s1, &priv->s2, sigma);
-
-  OPENSSL_memcpy(&values->s1_ntt, &priv->s1, sizeof(values->s1_ntt));
-  vector_ntt(&values->s1_ntt);
-
-  matrix_mult(&values->t, &values->a_ntt, &values->s1_ntt);
-  vector_inverse_ntt(&values->t);
-  vector_add(&values->t, &values->t, &priv->s2);
-
-  vector_power2_round(&values->pub.t1, &priv->t0, &values->t);
-  // t1 is public.
-  CONSTTIME_DECLASSIFY(&values->pub.t1, sizeof(values->pub.t1));
-
-  CBB cbb;
-  CBB_init_fixed(&cbb, out_encoded_public_key, public_key_bytes<K>());
-  if (!mldsa_marshal_public_key(&cbb, &values->pub)) {
-    return 0;
-  }
-  assert(CBB_len(&cbb) == public_key_bytes<K>());
-
-  BORINGSSL_keccak(priv->public_key_hash, sizeof(priv->public_key_hash),
-                   out_encoded_public_key, public_key_bytes<K>(),
-                   boringssl_shake256);
-
-  return 1;
-}
-
-template <int K, int L>
-static int mldsa_public_from_private(struct public_key<K> *pub,
-                                     const struct private_key<K, L> *priv) {
-  // Intermediate values, allocated on the heap to allow use when there is a
-  // limited amount of stack.
-  struct values_st {
-    matrix<K, L> a_ntt;
-    vector<L> s1_ntt;
-    vector<K> t;
-    vector<K> t0;
-  };
-  std::unique_ptr<values_st, DeleterFree<values_st>> values(
-      reinterpret_cast<struct values_st *>(OPENSSL_malloc(sizeof(values_st))));
-  if (values == NULL) {
-    return 0;
-  }
-
-
-  OPENSSL_memcpy(pub->rho, priv->rho, sizeof(pub->rho));
-  OPENSSL_memcpy(pub->public_key_hash, priv->public_key_hash,
-                 sizeof(pub->public_key_hash));
-
-  matrix_expand(&values->a_ntt, priv->rho);
-
-  OPENSSL_memcpy(&values->s1_ntt, &priv->s1, sizeof(values->s1_ntt));
-  vector_ntt(&values->s1_ntt);
-
-  matrix_mult(&values->t, &values->a_ntt, &values->s1_ntt);
-  vector_inverse_ntt(&values->t);
-  vector_add(&values->t, &values->t, &priv->s2);
-
-  vector_power2_round(&pub->t1, &values->t0, &values->t);
-  return 1;
-}
-
-// FIPS 204, Algorithm 7 (`ML-DSA.Sign_internal`). Returns 1 on success and 0
-// on failure.
-template <int K, int L>
-static int mldsa_sign_internal(
-    uint8_t out_encoded_signature[signature_bytes<K>()],
-    const struct private_key<K, L> *priv, const uint8_t *msg, size_t msg_len,
-    const uint8_t *context_prefix, size_t context_prefix_len,
-    const uint8_t *context, size_t context_len,
-    const uint8_t randomizer[MLDSA_SIGNATURE_RANDOMIZER_BYTES]) {
-  uint8_t mu[kMuBytes];
-  struct BORINGSSL_keccak_st keccak_ctx;
-  BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256);
-  BORINGSSL_keccak_absorb(&keccak_ctx, priv->public_key_hash,
-                          sizeof(priv->public_key_hash));
-  BORINGSSL_keccak_absorb(&keccak_ctx, context_prefix, context_prefix_len);
-  BORINGSSL_keccak_absorb(&keccak_ctx, context, context_len);
-  BORINGSSL_keccak_absorb(&keccak_ctx, msg, msg_len);
-  BORINGSSL_keccak_squeeze(&keccak_ctx, mu, kMuBytes);
-
-  uint8_t rho_prime[kRhoPrimeBytes];
-  BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256);
-  BORINGSSL_keccak_absorb(&keccak_ctx, priv->k, sizeof(priv->k));
-  BORINGSSL_keccak_absorb(&keccak_ctx, randomizer,
-                          MLDSA_SIGNATURE_RANDOMIZER_BYTES);
-  BORINGSSL_keccak_absorb(&keccak_ctx, mu, kMuBytes);
-  BORINGSSL_keccak_squeeze(&keccak_ctx, rho_prime, kRhoPrimeBytes);
-
-  // Intermediate values, allocated on the heap to allow use when there is a
-  // limited amount of stack.
-  struct values_st {
-    struct signature<K, L> sign;
-    vector<L> s1_ntt;
-    vector<K> s2_ntt;
-    vector<K> t0_ntt;
-    matrix<K, L> a_ntt;
-    vector<L> y;
-    vector<K> w;
-    vector<K> w1;
-    vector<L> cs1;
-    vector<K> cs2;
-  };
-  std::unique_ptr<values_st, DeleterFree<values_st>> values(
-      reinterpret_cast<struct values_st *>(OPENSSL_malloc(sizeof(values_st))));
-  if (values == NULL) {
-    return 0;
-  }
-  OPENSSL_memcpy(&values->s1_ntt, &priv->s1, sizeof(values->s1_ntt));
-  vector_ntt(&values->s1_ntt);
-
-  OPENSSL_memcpy(&values->s2_ntt, &priv->s2, sizeof(values->s2_ntt));
-  vector_ntt(&values->s2_ntt);
-
-  OPENSSL_memcpy(&values->t0_ntt, &priv->t0, sizeof(values->t0_ntt));
-  vector_ntt(&values->t0_ntt);
-
-  matrix_expand(&values->a_ntt, priv->rho);
-
-  // kappa must not exceed 2**16/L = 13107. But the probability of it
-  // exceeding even 1000 iterations is vanishingly small.
-  for (size_t kappa = 0;; kappa += L) {
-    vector_expand_mask(&values->y, rho_prime, kappa);
-
-    vector<L> *y_ntt = &values->cs1;
-    OPENSSL_memcpy(y_ntt, &values->y, sizeof(*y_ntt));
-    vector_ntt(y_ntt);
-
-    matrix_mult(&values->w, &values->a_ntt, y_ntt);
-    vector_inverse_ntt(&values->w);
-
-    vector_high_bits(&values->w1, &values->w);
-    uint8_t w1_encoded[128 * K];
-    w1_encode(w1_encoded, &values->w1);
-
-    BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256);
-    BORINGSSL_keccak_absorb(&keccak_ctx, mu, kMuBytes);
-    BORINGSSL_keccak_absorb(&keccak_ctx, w1_encoded, 128 * K);
-    BORINGSSL_keccak_squeeze(&keccak_ctx, values->sign.c_tilde,
-                             2 * lambda_bytes<K>());
-
-    scalar c_ntt;
-    scalar_sample_in_ball_vartime(&c_ntt, values->sign.c_tilde,
-                                  sizeof(values->sign.c_tilde), tau<K>());
-    scalar_ntt(&c_ntt);
-
-    vector_mult_scalar(&values->cs1, &values->s1_ntt, &c_ntt);
-    vector_inverse_ntt(&values->cs1);
-    vector_mult_scalar(&values->cs2, &values->s2_ntt, &c_ntt);
-    vector_inverse_ntt(&values->cs2);
-
-    vector_add(&values->sign.z, &values->y, &values->cs1);
-
-    vector<K> *r0 = &values->w1;
-    vector_sub(r0, &values->w, &values->cs2);
-    vector_low_bits(r0, r0);
-
-    // Leaking the fact that a signature was rejected is fine as the next
-    // attempt at a signature will be (indistinguishable from) independent of
-    // this one. Note, however, that we additionally leak which of the two
-    // branches rejected the signature. Section 5.5 of
-    // https://pq-crystals.org/dilithium/data/dilithium-specification-round3.pdf
-    // describes this leak as OK. Note we leak less than what is described by
-    // the paper; we do not reveal which coefficient violated the bound, and
-    // we hide which of the |z_max| or |r0_max| bound failed. See also
-    // https://boringssl-review.googlesource.com/c/boringssl/+/67747/comment/2bbab0fa_d241d35a/
-    uint32_t z_max = vector_max(&values->sign.z);
-    uint32_t r0_max = vector_max_signed(r0);
-    if (constant_time_declassify_w(
-            constant_time_ge_w(z_max, gamma1<K>() - beta<K>()) |
-            constant_time_ge_w(r0_max, kGamma2 - beta<K>()))) {
-      continue;
-    }
-
-    vector<K> *ct0 = &values->w1;
-    vector_mult_scalar(ct0, &values->t0_ntt, &c_ntt);
-    vector_inverse_ntt(ct0);
-    vector_make_hint(&values->sign.h, ct0, &values->cs2, &values->w);
-
-    // See above.
-    uint32_t ct0_max = vector_max(ct0);
-    size_t h_ones = vector_count_ones(&values->sign.h);
-    if (constant_time_declassify_w(constant_time_ge_w(ct0_max, kGamma2) |
-                                   constant_time_lt_w(omega<K>(), h_ones))) {
-      continue;
-    }
-
-    // Although computed with the private key, the signature is public.
-    CONSTTIME_DECLASSIFY(values->sign.c_tilde, sizeof(values->sign.c_tilde));
-    CONSTTIME_DECLASSIFY(&values->sign.z, sizeof(values->sign.z));
-    CONSTTIME_DECLASSIFY(&values->sign.h, sizeof(values->sign.h));
-
-    CBB cbb;
-    CBB_init_fixed(&cbb, out_encoded_signature, signature_bytes<K>());
-    if (!mldsa_marshal_signature(&cbb, &values->sign)) {
-      return 0;
-    }
-
-    BSSL_CHECK(CBB_len(&cbb) == signature_bytes<K>());
-    return 1;
-  }
-}
-
-// FIPS 204, Algorithm 8 (`ML-DSA.Verify_internal`).
-template <int K, int L>
-static int mldsa_verify_internal(
-    const struct public_key<K> *pub,
-    const uint8_t encoded_signature[signature_bytes<K>()], const uint8_t *msg,
-    size_t msg_len, const uint8_t *context_prefix, size_t context_prefix_len,
-    const uint8_t *context, size_t context_len) {
-  // Intermediate values, allocated on the heap to allow use when there is a
-  // limited amount of stack.
-  struct values_st {
-    struct signature<K, L> sign;
-    matrix<K, L> a_ntt;
-    vector<L> z_ntt;
-    vector<K> az_ntt;
-    vector<K> ct1_ntt;
-  };
-  std::unique_ptr<values_st, DeleterFree<values_st>> values(
-      reinterpret_cast<struct values_st *>(OPENSSL_malloc(sizeof(values_st))));
-  if (values == NULL) {
-    return 0;
-  }
-
-  CBS cbs;
-  CBS_init(&cbs, encoded_signature, signature_bytes<K>());
-  if (!mldsa_parse_signature(&values->sign, &cbs)) {
-    return 0;
-  }
-
-  matrix_expand(&values->a_ntt, pub->rho);
-
-  uint8_t mu[kMuBytes];
-  struct BORINGSSL_keccak_st keccak_ctx;
-  BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256);
-  BORINGSSL_keccak_absorb(&keccak_ctx, pub->public_key_hash,
-                          sizeof(pub->public_key_hash));
-  BORINGSSL_keccak_absorb(&keccak_ctx, context_prefix, context_prefix_len);
-  BORINGSSL_keccak_absorb(&keccak_ctx, context, context_len);
-  BORINGSSL_keccak_absorb(&keccak_ctx, msg, msg_len);
-  BORINGSSL_keccak_squeeze(&keccak_ctx, mu, kMuBytes);
-
-  scalar c_ntt;
-  scalar_sample_in_ball_vartime(&c_ntt, values->sign.c_tilde,
-                                sizeof(values->sign.c_tilde), tau<K>());
-  scalar_ntt(&c_ntt);
-
-  OPENSSL_memcpy(&values->z_ntt, &values->sign.z, sizeof(values->z_ntt));
-  vector_ntt(&values->z_ntt);
-
-  matrix_mult(&values->az_ntt, &values->a_ntt, &values->z_ntt);
-
-  vector_scale_power2_round(&values->ct1_ntt, &pub->t1);
-  vector_ntt(&values->ct1_ntt);
-
-  vector_mult_scalar(&values->ct1_ntt, &values->ct1_ntt, &c_ntt);
-
-  vector<K> *const w1 = &values->az_ntt;
-  vector_sub(w1, &values->az_ntt, &values->ct1_ntt);
-  vector_inverse_ntt(w1);
-
-  vector_use_hint_vartime(w1, &values->sign.h, w1);
-  uint8_t w1_encoded[128 * K];
-  w1_encode(w1_encoded, w1);
-
-  uint8_t c_tilde[2 * lambda_bytes<K>()];
-  BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256);
-  BORINGSSL_keccak_absorb(&keccak_ctx, mu, kMuBytes);
-  BORINGSSL_keccak_absorb(&keccak_ctx, w1_encoded, 128 * K);
-  BORINGSSL_keccak_squeeze(&keccak_ctx, c_tilde, 2 * lambda_bytes<K>());
-
-  uint32_t z_max = vector_max(&values->sign.z);
-  return z_max < static_cast<uint32_t>(gamma1<K>() - beta<K>()) &&
-         OPENSSL_memcmp(c_tilde, values->sign.c_tilde, 2 * lambda_bytes<K>()) ==
-             0;
-}
-
-}  // namespace
-
-// ML-DSA-65 specific wrappers.
-
-static struct private_key<6, 5> *mldsa65_private_key_from_external(
-    const struct MLDSA65_private_key *external) {
-  static_assert(sizeof(struct MLDSA65_private_key) ==
-                    sizeof(struct private_key<6, 5>),
-                "MLDSA65 private key size incorrect");
-  static_assert(alignof(struct MLDSA65_private_key) ==
-                    alignof(struct private_key<6, 5>),
-                "MLDSA65 private key align incorrect");
-  return (struct private_key<6, 5> *)external;
-}
-
-static struct public_key<6> *
-mldsa65_public_key_from_external(const struct MLDSA65_public_key *external) {
-  static_assert(sizeof(struct MLDSA65_public_key) ==
-                    sizeof(struct public_key<6>),
-                "MLDSA65 public key size incorrect");
-  static_assert(alignof(struct MLDSA65_public_key) ==
-                    alignof(struct public_key<6>),
-                "MLDSA65 public key align incorrect");
-  return (struct public_key<6> *)external;
-}
-
-int MLDSA65_parse_public_key(struct MLDSA65_public_key *public_key, CBS *in) {
-  return mldsa_parse_public_key(mldsa65_public_key_from_external(public_key),
-                                in);
-}
-
-int MLDSA65_marshal_private_key(CBB *out,
-                                const struct MLDSA65_private_key *private_key) {
-  return mldsa_marshal_private_key(
-      out, mldsa65_private_key_from_external(private_key));
-}
-
-int MLDSA65_parse_private_key(struct MLDSA65_private_key *private_key,
-                              CBS *in) {
-  return mldsa_parse_private_key(mldsa65_private_key_from_external(private_key),
-                                 in) &&
-         CBS_len(in) == 0;
-}
+static_assert(sizeof(BCM_mldsa65_private_key) == sizeof(MLDSA65_private_key),
+              "");
+static_assert(alignof(BCM_mldsa65_private_key) == alignof(MLDSA65_private_key),
+              "");
+static_assert(sizeof(BCM_mldsa65_public_key) == sizeof(MLDSA65_public_key), "");
+static_assert(alignof(BCM_mldsa65_public_key) == alignof(MLDSA65_public_key),
+              "");
+static_assert(MLDSA_SEED_BYTES == BCM_MLDSA_SEED_BYTES, "");
+static_assert(MLDSA65_PRIVATE_KEY_BYTES == BCM_MLDSA65_PRIVATE_KEY_BYTES, "");
+static_assert(MLDSA65_PUBLIC_KEY_BYTES == BCM_MLDSA65_PUBLIC_KEY_BYTES, "");
+static_assert(MLDSA65_SIGNATURE_BYTES == BCM_MLDSA65_SIGNATURE_BYTES, "");
 
-// Calls |MLDSA_generate_key_external_entropy| with random bytes from
-// |RAND_bytes|. Returns 1 on success and 0 on failure.
 int MLDSA65_generate_key(
     uint8_t out_encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES],
     uint8_t out_seed[MLDSA_SEED_BYTES],
     struct MLDSA65_private_key *out_private_key) {
-  RAND_bytes(out_seed, MLDSA_SEED_BYTES);
-  return MLDSA65_generate_key_external_entropy(out_encoded_public_key,
-                                               out_private_key, out_seed);
+  return bcm_success(BCM_mldsa65_generate_key(
+      out_encoded_public_key, out_seed,
+      reinterpret_cast<BCM_mldsa65_private_key *>(out_private_key)));
 }
 
 int MLDSA65_private_key_from_seed(struct MLDSA65_private_key *out_private_key,
                                   const uint8_t *seed, size_t seed_len) {
-  if (seed_len != MLDSA_SEED_BYTES) {
+  if (seed_len != BCM_MLDSA_SEED_BYTES) {
     return 0;
   }
-  uint8_t public_key[MLDSA65_PUBLIC_KEY_BYTES];
-  return MLDSA65_generate_key_external_entropy(public_key, out_private_key,
-                                               seed);
-}
-
-int MLDSA65_generate_key_external_entropy(
-    uint8_t out_encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES],
-    struct MLDSA65_private_key *out_private_key,
-    const uint8_t entropy[MLDSA_SEED_BYTES]) {
-  return mldsa_generate_key_external_entropy(
-      out_encoded_public_key,
-      mldsa65_private_key_from_external(out_private_key), entropy);
+  return bcm_success(BCM_mldsa65_private_key_from_seed(
+      reinterpret_cast<BCM_mldsa65_private_key *>(out_private_key), seed));
 }
 
 int MLDSA65_public_from_private(struct MLDSA65_public_key *out_public_key,
                                 const struct MLDSA65_private_key *private_key) {
-  return mldsa_public_from_private(
-      mldsa65_public_key_from_external(out_public_key),
-      mldsa65_private_key_from_external(private_key));
+  return bcm_success(BCM_mldsa65_public_from_private(
+      reinterpret_cast<BCM_mldsa65_public_key *>(out_public_key),
+      reinterpret_cast<const BCM_mldsa65_private_key *>(private_key)));
 }
 
-int MLDSA65_sign_internal(
-    uint8_t out_encoded_signature[MLDSA65_SIGNATURE_BYTES],
-    const struct MLDSA65_private_key *private_key, const uint8_t *msg,
-    size_t msg_len, const uint8_t *context_prefix, size_t context_prefix_len,
-    const uint8_t *context, size_t context_len,
-    const uint8_t randomizer[MLDSA_SIGNATURE_RANDOMIZER_BYTES]) {
-  return mldsa_sign_internal(out_encoded_signature,
-                             mldsa65_private_key_from_external(private_key),
-                             msg, msg_len, context_prefix, context_prefix_len,
-                             context, context_len, randomizer);
-}
-
-// ML-DSA signature in randomized mode, filling the random bytes with
-// |RAND_bytes|. Returns 1 on success and 0 on failure.
 int MLDSA65_sign(uint8_t out_encoded_signature[MLDSA65_SIGNATURE_BYTES],
                  const struct MLDSA65_private_key *private_key,
                  const uint8_t *msg, size_t msg_len, const uint8_t *context,
@@ -1748,43 +60,37 @@
   if (context_len > 255) {
     return 0;
   }
-
-  uint8_t randomizer[MLDSA_SIGNATURE_RANDOMIZER_BYTES];
-  RAND_bytes(randomizer, sizeof(randomizer));
-
-  const uint8_t context_prefix[2] = {0, static_cast<uint8_t>(context_len)};
-  return MLDSA65_sign_internal(out_encoded_signature, private_key, msg, msg_len,
-                               context_prefix, sizeof(context_prefix), context,
-                               context_len, randomizer);
+  return bcm_success(BCM_mldsa65_sign(
+      out_encoded_signature,
+      reinterpret_cast<const BCM_mldsa65_private_key *>(private_key), msg,
+      msg_len, context, context_len));
 }
 
-// FIPS 204, Algorithm 3 (`ML-DSA.Verify`).
 int MLDSA65_verify(const struct MLDSA65_public_key *public_key,
                    const uint8_t *signature, size_t signature_len,
                    const uint8_t *msg, size_t msg_len, const uint8_t *context,
                    size_t context_len) {
-  if (context_len > 255 || signature_len != MLDSA65_SIGNATURE_BYTES) {
+  if (context_len > 255 || signature_len != BCM_MLDSA65_SIGNATURE_BYTES) {
     return 0;
   }
-
-  const uint8_t context_prefix[2] = {0, static_cast<uint8_t>(context_len)};
-  return MLDSA65_verify_internal(public_key, signature, msg, msg_len,
-                                 context_prefix, sizeof(context_prefix),
-                                 context, context_len);
-}
-
-int MLDSA65_verify_internal(
-    const struct MLDSA65_public_key *public_key,
-    const uint8_t encoded_signature[MLDSA65_SIGNATURE_BYTES],
-    const uint8_t *msg, size_t msg_len, const uint8_t *context_prefix,
-    size_t context_prefix_len, const uint8_t *context, size_t context_len) {
-  return mldsa_verify_internal<6, 5>(
-      mldsa65_public_key_from_external(public_key), encoded_signature, msg,
-      msg_len, context_prefix, context_prefix_len, context, context_len);
+  return bcm_success(BCM_mldsa65_verify(
+      reinterpret_cast<const BCM_mldsa65_public_key *>(public_key), signature,
+      msg, msg_len, context, context_len));
 }
 
 int MLDSA65_marshal_public_key(CBB *out,
                                const struct MLDSA65_public_key *public_key) {
-  return mldsa_marshal_public_key(out,
-                                  mldsa65_public_key_from_external(public_key));
+  return bcm_success(BCM_mldsa65_marshal_public_key(
+      out, reinterpret_cast<const BCM_mldsa65_public_key *>(public_key)));
+}
+
+int MLDSA65_parse_public_key(struct MLDSA65_public_key *public_key, CBS *in) {
+  return bcm_success(BCM_mldsa65_parse_public_key(
+      reinterpret_cast<BCM_mldsa65_public_key *>(public_key), in));
+}
+
+int MLDSA65_parse_private_key(struct MLDSA65_private_key *private_key,
+                              CBS *in) {
+  return bcm_success(BCM_mldsa65_parse_private_key(
+      reinterpret_cast<BCM_mldsa65_private_key *>(private_key), in));
 }
diff --git a/crypto/mldsa/mldsa_test.cc b/crypto/mldsa/mldsa_test.cc
index 5737f54..67e78af 100644
--- a/crypto/mldsa/mldsa_test.cc
+++ b/crypto/mldsa/mldsa_test.cc
@@ -23,21 +23,21 @@
 #include <openssl/mem.h>
 #include <openssl/span.h>
 
+#include "../fipsmodule/bcm_interface.h"
 #include "../test/file_test.h"
 #include "../test/test_util.h"
-#include "./internal.h"
 
 
 namespace {
 
 template <typename T>
-std::vector<uint8_t> Marshal(int (*marshal_func)(CBB *, const T *),
+std::vector<uint8_t> Marshal(bcm_status_t (*marshal_func)(CBB *, const T *),
                              const T *t) {
   bssl::ScopedCBB cbb;
   uint8_t *encoded;
   size_t encoded_len;
-  if (!CBB_init(cbb.get(), 1) ||      //
-      !marshal_func(cbb.get(), t) ||  //
+  if (!CBB_init(cbb.get(), 1) ||                             //
+      marshal_func(cbb.get(), t) != bcm_status::approved ||  //
       !CBB_finish(cbb.get(), &encoded, &encoded_len)) {
     abort();
   }
@@ -110,8 +110,11 @@
   auto priv2 = std::make_unique<MLDSA65_private_key>();
   EXPECT_TRUE(MLDSA65_private_key_from_seed(priv2.get(), seed, sizeof(seed)));
 
-  EXPECT_EQ(Bytes(Marshal(MLDSA65_marshal_private_key, priv.get())),
-            Bytes(Marshal(MLDSA65_marshal_private_key, priv2.get())));
+  EXPECT_EQ(
+      Bytes(Marshal(BCM_mldsa65_marshal_private_key,
+                    reinterpret_cast<BCM_mldsa65_private_key *>(priv.get()))),
+      Bytes(Marshal(BCM_mldsa65_marshal_private_key,
+                    reinterpret_cast<BCM_mldsa65_private_key *>(priv2.get()))));
 }
 
 TEST(MLDSATest, SignatureIsRandomized) {
@@ -199,7 +202,8 @@
   CBB cbb;
   std::vector<uint8_t> malformed_private_key(MLDSA65_PRIVATE_KEY_BYTES + 1, 0);
   CBB_init_fixed(&cbb, malformed_private_key.data(), MLDSA65_PRIVATE_KEY_BYTES);
-  ASSERT_TRUE(MLDSA65_marshal_private_key(&cbb, priv.get()));
+  ASSERT_TRUE(bcm_success(BCM_mldsa65_marshal_private_key(
+      &cbb, reinterpret_cast<BCM_mldsa65_private_key *>(priv.get()))));
 
   CBS cbs;
   auto parsed_priv = std::make_unique<MLDSA65_private_key>();
@@ -228,18 +232,19 @@
   CBS_init(&cbs, private_key_bytes.data(), private_key_bytes.size());
   EXPECT_TRUE(MLDSA65_parse_private_key(priv.get(), &cbs));
 
-  const uint8_t zero_randomizer[MLDSA_SIGNATURE_RANDOMIZER_BYTES] = {0};
+  const uint8_t zero_randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES] = {0};
   std::vector<uint8_t> signature(MLDSA65_SIGNATURE_BYTES);
-  EXPECT_TRUE(MLDSA65_sign_internal(signature.data(), priv.get(), msg.data(),
-                                    msg.size(), nullptr, 0, nullptr, 0,
-                                    zero_randomizer));
+  EXPECT_TRUE(bcm_success(BCM_mldsa65_sign_internal(
+      signature.data(), reinterpret_cast<BCM_mldsa65_private_key *>(priv.get()),
+      msg.data(), msg.size(), nullptr, 0, nullptr, 0, zero_randomizer)));
 
   EXPECT_EQ(Bytes(signature), Bytes(expected_signature));
 
   auto pub = std::make_unique<MLDSA65_public_key>();
   ASSERT_TRUE(MLDSA65_public_from_private(pub.get(), priv.get()));
-  EXPECT_TRUE(MLDSA65_verify_internal(pub.get(), signature.data(), msg.data(),
-                                      msg.size(), nullptr, 0, nullptr, 0));
+  EXPECT_TRUE(bcm_success(BCM_mldsa65_verify_internal(
+      reinterpret_cast<BCM_mldsa65_public_key *>(pub.get()), signature.data(),
+      msg.data(), msg.size(), nullptr, 0, nullptr, 0)));
 }
 
 TEST(MLDSATest, SigGenTests) {
@@ -254,8 +259,9 @@
 
   std::vector<uint8_t> encoded_public_key(MLDSA65_PUBLIC_KEY_BYTES);
   auto priv = std::make_unique<MLDSA65_private_key>();
-  ASSERT_TRUE(MLDSA65_generate_key_external_entropy(encoded_public_key.data(),
-                                                    priv.get(), seed.data()));
+  ASSERT_TRUE(bcm_success(BCM_mldsa65_generate_key_external_entropy(
+      encoded_public_key.data(),
+      reinterpret_cast<BCM_mldsa65_private_key *>(priv.get()), seed.data())));
 
   EXPECT_EQ(Bytes(encoded_public_key), Bytes(expected_public_key));
 }
@@ -265,10 +271,11 @@
 }
 
 template <typename PrivateKey, int (*ParsePrivateKey)(PrivateKey *, CBS *),
-          size_t SignatureBytes,
-          int (*SignInternal)(uint8_t *, const PrivateKey *, const uint8_t *,
-                              size_t, const uint8_t *, size_t, const uint8_t *,
-                              size_t, const uint8_t *)>
+          size_t SignatureBytes, typename BcmPrivateKey,
+          bcm_status_t (*SignInternal)(uint8_t *, const BcmPrivateKey *,
+                                       const uint8_t *, size_t, const uint8_t *,
+                                       size_t, const uint8_t *, size_t,
+                                       const uint8_t *)>
 static void MLDSAWycheproofSignTest(FileTest *t) {
   std::vector<uint8_t> private_key_bytes, msg, expected_signature, context;
   ASSERT_TRUE(t->GetInstructionBytes(&private_key_bytes, "privateKey"));
@@ -299,12 +306,13 @@
     return;
   }
 
-  const uint8_t zero_randomizer[MLDSA_SIGNATURE_RANDOMIZER_BYTES] = {0};
+  const uint8_t zero_randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES] = {0};
   std::vector<uint8_t> signature(SignatureBytes);
   const uint8_t context_prefix[2] = {0, static_cast<uint8_t>(context.size())};
-  EXPECT_TRUE(SignInternal(signature.data(), priv.get(), msg.data(), msg.size(),
-                           context_prefix, sizeof(context_prefix),
-                           context.data(), context.size(), zero_randomizer));
+  EXPECT_TRUE(bcm_success(SignInternal(
+      signature.data(), reinterpret_cast<BcmPrivateKey *>(priv.get()),
+      msg.data(), msg.size(), context_prefix, sizeof(context_prefix),
+      context.data(), context.size(), zero_randomizer)));
 
   EXPECT_EQ(Bytes(signature), Bytes(expected_signature));
 }
@@ -313,7 +321,8 @@
   FileTestGTest(
       "third_party/wycheproof_testvectors/mldsa_65_standard_sign_test.txt",
       MLDSAWycheproofSignTest<MLDSA65_private_key, MLDSA65_parse_private_key,
-                              MLDSA65_SIGNATURE_BYTES, MLDSA65_sign_internal>);
+                              MLDSA65_SIGNATURE_BYTES, BCM_mldsa65_private_key,
+                              BCM_mldsa65_sign_internal>);
 }
 
 template <typename PublicKey, int (*ParsePublicKey)(PublicKey *, CBS *),
diff --git a/crypto/mlkem/mlkem.cc b/crypto/mlkem/mlkem.cc
index 5ba4765..1636eca 100644
--- a/crypto/mlkem/mlkem.cc
+++ b/crypto/mlkem/mlkem.cc
@@ -24,8 +24,8 @@
 #include <openssl/mem.h>
 #include <openssl/rand.h>
 
+#include "../fipsmodule/keccak/internal.h"
 #include "../internal.h"
-#include "../keccak/internal.h"
 #include "./internal.h"
 
 
diff --git a/crypto/mlkem/mlkem_test.cc b/crypto/mlkem/mlkem_test.cc
index 5b3371d..2516302 100644
--- a/crypto/mlkem/mlkem_test.cc
+++ b/crypto/mlkem/mlkem_test.cc
@@ -24,7 +24,7 @@
 #include <openssl/mem.h>
 #include <openssl/mlkem.h>
 
-#include "../keccak/internal.h"
+#include "../fipsmodule/keccak/internal.h"
 #include "../test/file_test.h"
 #include "../test/test_util.h"
 #include "./internal.h"
diff --git a/gen/sources.bzl b/gen/sources.bzl
index fa199e9..267487b 100644
--- a/gen/sources.bzl
+++ b/gen/sources.bzl
@@ -71,6 +71,8 @@
     "crypto/fipsmodule/ecdsa/ecdsa.cc.inc",
     "crypto/fipsmodule/hkdf/hkdf.cc.inc",
     "crypto/fipsmodule/hmac/hmac.cc.inc",
+    "crypto/fipsmodule/keccak/keccak.cc.inc",
+    "crypto/fipsmodule/mldsa/mldsa.cc.inc",
     "crypto/fipsmodule/modes/cbc.cc.inc",
     "crypto/fipsmodule/modes/cfb.cc.inc",
     "crypto/fipsmodule/modes/ctr.cc.inc",
@@ -357,7 +359,6 @@
     "crypto/fipsmodule/fips_shared_support.cc",
     "crypto/hpke/hpke.cc",
     "crypto/hrss/hrss.cc",
-    "crypto/keccak/keccak.cc",
     "crypto/kyber/kyber.cc",
     "crypto/lhash/lhash.cc",
     "crypto/md4/md4.cc",
@@ -611,6 +612,7 @@
     "crypto/fipsmodule/ec/p256-nistz.h",
     "crypto/fipsmodule/ec/p256_table.h",
     "crypto/fipsmodule/ecdsa/internal.h",
+    "crypto/fipsmodule/keccak/internal.h",
     "crypto/fipsmodule/modes/internal.h",
     "crypto/fipsmodule/rand/internal.h",
     "crypto/fipsmodule/rsa/internal.h",
@@ -619,11 +621,9 @@
     "crypto/fipsmodule/tls/internal.h",
     "crypto/hrss/internal.h",
     "crypto/internal.h",
-    "crypto/keccak/internal.h",
     "crypto/kyber/internal.h",
     "crypto/lhash/internal.h",
     "crypto/md5/internal.h",
-    "crypto/mldsa/internal.h",
     "crypto/mlkem/internal.h",
     "crypto/obj/obj_dat.h",
     "crypto/pkcs7/internal.h",
@@ -724,6 +724,7 @@
     "crypto/fipsmodule/ec/p256_test.cc",
     "crypto/fipsmodule/ecdsa/ecdsa_test.cc",
     "crypto/fipsmodule/hkdf/hkdf_test.cc",
+    "crypto/fipsmodule/keccak/keccak_test.cc",
     "crypto/fipsmodule/modes/gcm_test.cc",
     "crypto/fipsmodule/rand/ctrdrbg_test.cc",
     "crypto/fipsmodule/service_indicator/service_indicator_test.cc",
@@ -732,7 +733,6 @@
     "crypto/hpke/hpke_test.cc",
     "crypto/hrss/hrss_test.cc",
     "crypto/impl_dispatch_test.cc",
-    "crypto/keccak/keccak_test.cc",
     "crypto/kyber/kyber_test.cc",
     "crypto/lhash/lhash_test.cc",
     "crypto/md5/md5_test.cc",
@@ -819,10 +819,10 @@
     "crypto/fipsmodule/ec/p256-nistz_tests.txt",
     "crypto/fipsmodule/ecdsa/ecdsa_sign_tests.txt",
     "crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt",
+    "crypto/fipsmodule/keccak/keccak_tests.txt",
     "crypto/fipsmodule/rand/ctrdrbg_vectors.txt",
     "crypto/hmac_extra/hmac_tests.txt",
     "crypto/hpke/hpke_test_vectors.txt",
-    "crypto/keccak/keccak_tests.txt",
     "crypto/kyber/kyber_tests.txt",
     "crypto/mldsa/mldsa_nist_keygen_tests.txt",
     "crypto/mldsa/mldsa_nist_siggen_tests.txt",
diff --git a/gen/sources.cmake b/gen/sources.cmake
index 77010b8..7941a4a 100644
--- a/gen/sources.cmake
+++ b/gen/sources.cmake
@@ -75,6 +75,8 @@
   crypto/fipsmodule/ecdsa/ecdsa.cc.inc
   crypto/fipsmodule/hkdf/hkdf.cc.inc
   crypto/fipsmodule/hmac/hmac.cc.inc
+  crypto/fipsmodule/keccak/keccak.cc.inc
+  crypto/fipsmodule/mldsa/mldsa.cc.inc
   crypto/fipsmodule/modes/cbc.cc.inc
   crypto/fipsmodule/modes/cfb.cc.inc
   crypto/fipsmodule/modes/ctr.cc.inc
@@ -371,7 +373,6 @@
   crypto/fipsmodule/fips_shared_support.cc
   crypto/hpke/hpke.cc
   crypto/hrss/hrss.cc
-  crypto/keccak/keccak.cc
   crypto/kyber/kyber.cc
   crypto/lhash/lhash.cc
   crypto/md4/md4.cc
@@ -629,6 +630,7 @@
   crypto/fipsmodule/ec/p256-nistz.h
   crypto/fipsmodule/ec/p256_table.h
   crypto/fipsmodule/ecdsa/internal.h
+  crypto/fipsmodule/keccak/internal.h
   crypto/fipsmodule/modes/internal.h
   crypto/fipsmodule/rand/internal.h
   crypto/fipsmodule/rsa/internal.h
@@ -637,11 +639,9 @@
   crypto/fipsmodule/tls/internal.h
   crypto/hrss/internal.h
   crypto/internal.h
-  crypto/keccak/internal.h
   crypto/kyber/internal.h
   crypto/lhash/internal.h
   crypto/md5/internal.h
-  crypto/mldsa/internal.h
   crypto/mlkem/internal.h
   crypto/obj/obj_dat.h
   crypto/pkcs7/internal.h
@@ -748,6 +748,7 @@
   crypto/fipsmodule/ec/p256_test.cc
   crypto/fipsmodule/ecdsa/ecdsa_test.cc
   crypto/fipsmodule/hkdf/hkdf_test.cc
+  crypto/fipsmodule/keccak/keccak_test.cc
   crypto/fipsmodule/modes/gcm_test.cc
   crypto/fipsmodule/rand/ctrdrbg_test.cc
   crypto/fipsmodule/service_indicator/service_indicator_test.cc
@@ -756,7 +757,6 @@
   crypto/hpke/hpke_test.cc
   crypto/hrss/hrss_test.cc
   crypto/impl_dispatch_test.cc
-  crypto/keccak/keccak_test.cc
   crypto/kyber/kyber_test.cc
   crypto/lhash/lhash_test.cc
   crypto/md5/md5_test.cc
@@ -845,10 +845,10 @@
   crypto/fipsmodule/ec/p256-nistz_tests.txt
   crypto/fipsmodule/ecdsa/ecdsa_sign_tests.txt
   crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt
+  crypto/fipsmodule/keccak/keccak_tests.txt
   crypto/fipsmodule/rand/ctrdrbg_vectors.txt
   crypto/hmac_extra/hmac_tests.txt
   crypto/hpke/hpke_test_vectors.txt
-  crypto/keccak/keccak_tests.txt
   crypto/kyber/kyber_tests.txt
   crypto/mldsa/mldsa_nist_keygen_tests.txt
   crypto/mldsa/mldsa_nist_siggen_tests.txt
diff --git a/gen/sources.gni b/gen/sources.gni
index 3af7dfc..45ea9f4 100644
--- a/gen/sources.gni
+++ b/gen/sources.gni
@@ -71,6 +71,8 @@
   "crypto/fipsmodule/ecdsa/ecdsa.cc.inc",
   "crypto/fipsmodule/hkdf/hkdf.cc.inc",
   "crypto/fipsmodule/hmac/hmac.cc.inc",
+  "crypto/fipsmodule/keccak/keccak.cc.inc",
+  "crypto/fipsmodule/mldsa/mldsa.cc.inc",
   "crypto/fipsmodule/modes/cbc.cc.inc",
   "crypto/fipsmodule/modes/cfb.cc.inc",
   "crypto/fipsmodule/modes/ctr.cc.inc",
@@ -357,7 +359,6 @@
   "crypto/fipsmodule/fips_shared_support.cc",
   "crypto/hpke/hpke.cc",
   "crypto/hrss/hrss.cc",
-  "crypto/keccak/keccak.cc",
   "crypto/kyber/kyber.cc",
   "crypto/lhash/lhash.cc",
   "crypto/md4/md4.cc",
@@ -611,6 +612,7 @@
   "crypto/fipsmodule/ec/p256-nistz.h",
   "crypto/fipsmodule/ec/p256_table.h",
   "crypto/fipsmodule/ecdsa/internal.h",
+  "crypto/fipsmodule/keccak/internal.h",
   "crypto/fipsmodule/modes/internal.h",
   "crypto/fipsmodule/rand/internal.h",
   "crypto/fipsmodule/rsa/internal.h",
@@ -619,11 +621,9 @@
   "crypto/fipsmodule/tls/internal.h",
   "crypto/hrss/internal.h",
   "crypto/internal.h",
-  "crypto/keccak/internal.h",
   "crypto/kyber/internal.h",
   "crypto/lhash/internal.h",
   "crypto/md5/internal.h",
-  "crypto/mldsa/internal.h",
   "crypto/mlkem/internal.h",
   "crypto/obj/obj_dat.h",
   "crypto/pkcs7/internal.h",
@@ -724,6 +724,7 @@
   "crypto/fipsmodule/ec/p256_test.cc",
   "crypto/fipsmodule/ecdsa/ecdsa_test.cc",
   "crypto/fipsmodule/hkdf/hkdf_test.cc",
+  "crypto/fipsmodule/keccak/keccak_test.cc",
   "crypto/fipsmodule/modes/gcm_test.cc",
   "crypto/fipsmodule/rand/ctrdrbg_test.cc",
   "crypto/fipsmodule/service_indicator/service_indicator_test.cc",
@@ -732,7 +733,6 @@
   "crypto/hpke/hpke_test.cc",
   "crypto/hrss/hrss_test.cc",
   "crypto/impl_dispatch_test.cc",
-  "crypto/keccak/keccak_test.cc",
   "crypto/kyber/kyber_test.cc",
   "crypto/lhash/lhash_test.cc",
   "crypto/md5/md5_test.cc",
@@ -819,10 +819,10 @@
   "crypto/fipsmodule/ec/p256-nistz_tests.txt",
   "crypto/fipsmodule/ecdsa/ecdsa_sign_tests.txt",
   "crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt",
+  "crypto/fipsmodule/keccak/keccak_tests.txt",
   "crypto/fipsmodule/rand/ctrdrbg_vectors.txt",
   "crypto/hmac_extra/hmac_tests.txt",
   "crypto/hpke/hpke_test_vectors.txt",
-  "crypto/keccak/keccak_tests.txt",
   "crypto/kyber/kyber_tests.txt",
   "crypto/mldsa/mldsa_nist_keygen_tests.txt",
   "crypto/mldsa/mldsa_nist_siggen_tests.txt",
diff --git a/gen/sources.json b/gen/sources.json
index 6afbc27..589126a 100644
--- a/gen/sources.json
+++ b/gen/sources.json
@@ -56,6 +56,8 @@
       "crypto/fipsmodule/ecdsa/ecdsa.cc.inc",
       "crypto/fipsmodule/hkdf/hkdf.cc.inc",
       "crypto/fipsmodule/hmac/hmac.cc.inc",
+      "crypto/fipsmodule/keccak/keccak.cc.inc",
+      "crypto/fipsmodule/mldsa/mldsa.cc.inc",
       "crypto/fipsmodule/modes/cbc.cc.inc",
       "crypto/fipsmodule/modes/cfb.cc.inc",
       "crypto/fipsmodule/modes/ctr.cc.inc",
@@ -341,7 +343,6 @@
       "crypto/fipsmodule/fips_shared_support.cc",
       "crypto/hpke/hpke.cc",
       "crypto/hrss/hrss.cc",
-      "crypto/keccak/keccak.cc",
       "crypto/kyber/kyber.cc",
       "crypto/lhash/lhash.cc",
       "crypto/md4/md4.cc",
@@ -593,6 +594,7 @@
       "crypto/fipsmodule/ec/p256-nistz.h",
       "crypto/fipsmodule/ec/p256_table.h",
       "crypto/fipsmodule/ecdsa/internal.h",
+      "crypto/fipsmodule/keccak/internal.h",
       "crypto/fipsmodule/modes/internal.h",
       "crypto/fipsmodule/rand/internal.h",
       "crypto/fipsmodule/rsa/internal.h",
@@ -601,11 +603,9 @@
       "crypto/fipsmodule/tls/internal.h",
       "crypto/hrss/internal.h",
       "crypto/internal.h",
-      "crypto/keccak/internal.h",
       "crypto/kyber/internal.h",
       "crypto/lhash/internal.h",
       "crypto/md5/internal.h",
-      "crypto/mldsa/internal.h",
       "crypto/mlkem/internal.h",
       "crypto/obj/obj_dat.h",
       "crypto/pkcs7/internal.h",
@@ -705,6 +705,7 @@
       "crypto/fipsmodule/ec/p256_test.cc",
       "crypto/fipsmodule/ecdsa/ecdsa_test.cc",
       "crypto/fipsmodule/hkdf/hkdf_test.cc",
+      "crypto/fipsmodule/keccak/keccak_test.cc",
       "crypto/fipsmodule/modes/gcm_test.cc",
       "crypto/fipsmodule/rand/ctrdrbg_test.cc",
       "crypto/fipsmodule/service_indicator/service_indicator_test.cc",
@@ -713,7 +714,6 @@
       "crypto/hpke/hpke_test.cc",
       "crypto/hrss/hrss_test.cc",
       "crypto/impl_dispatch_test.cc",
-      "crypto/keccak/keccak_test.cc",
       "crypto/kyber/kyber_test.cc",
       "crypto/lhash/lhash_test.cc",
       "crypto/md5/md5_test.cc",
@@ -799,10 +799,10 @@
       "crypto/fipsmodule/ec/p256-nistz_tests.txt",
       "crypto/fipsmodule/ecdsa/ecdsa_sign_tests.txt",
       "crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt",
+      "crypto/fipsmodule/keccak/keccak_tests.txt",
       "crypto/fipsmodule/rand/ctrdrbg_vectors.txt",
       "crypto/hmac_extra/hmac_tests.txt",
       "crypto/hpke/hpke_test_vectors.txt",
-      "crypto/keccak/keccak_tests.txt",
       "crypto/kyber/kyber_tests.txt",
       "crypto/mldsa/mldsa_nist_keygen_tests.txt",
       "crypto/mldsa/mldsa_nist_siggen_tests.txt",
diff --git a/tool/speed.cc b/tool/speed.cc
index d641d20..ae665a7 100644
--- a/tool/speed.cc
+++ b/tool/speed.cc
@@ -70,7 +70,6 @@
 #include "../crypto/ec_extra/internal.h"
 #include "../crypto/fipsmodule/ec/internal.h"
 #include "../crypto/internal.h"
-#include "../crypto/mldsa/internal.h"
 #include "../crypto/trust_token/internal.h"
 #include "internal.h"
 
@@ -1155,27 +1154,6 @@
 
   results.Print("MLDSA key generation");
 
-  auto encoded_private_key =
-      std::make_unique<uint8_t[]>(MLDSA65_PRIVATE_KEY_BYTES);
-  CBB cbb;
-  CBB_init_fixed(&cbb, encoded_private_key.get(), MLDSA65_PRIVATE_KEY_BYTES);
-  MLDSA65_marshal_private_key(&cbb, priv.get());
-
-  if (!TimeFunctionParallel(&results, [&]() -> bool {
-        CBS cbs;
-        CBS_init(&cbs, encoded_private_key.get(), MLDSA65_PRIVATE_KEY_BYTES);
-        if (!MLDSA65_parse_private_key(priv.get(), &cbs)) {
-          fprintf(stderr, "Failure in MLDSA65_parse_private_key.\n");
-          return false;
-        }
-        return true;
-      })) {
-    fprintf(stderr, "Failed to time MLDSA65_parse_private_key.\n");
-    return false;
-  }
-
-  results.Print("MLDSA parse (valid) private key");
-
   const char *message = "Hello world";
   size_t message_len = strlen(message);
   auto out_encoded_signature =