ML-KEM benchmarks: add per-operation microbenchmarks.

The existing combined benchmarks for the full TLS server and client sides are retained.

Also, fix a small bug in the ParseEncap benchmark that made its timing
depend on the public key, adding more noise to the output than necessary.

Bug: 503700354
Change-Id: I9e2da2d0c25b71c0cb325713f63e04d56a6a6964
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/93088
Reviewed-by: Xiangfei Ding <xfding@google.com>
Presubmit-BoringSSL-Verified: boringssl-scoped@luci-project-accounts.iam.gserviceaccount.com <boringssl-scoped@luci-project-accounts.iam.gserviceaccount.com>
diff --git a/bench/mlkem.cc b/bench/mlkem.cc
index ef1c6dd..8ea49f1 100644
--- a/bench/mlkem.cc
+++ b/bench/mlkem.cc
@@ -25,6 +25,8 @@
 #include "./internal.h"
 
 namespace {
+
+// generate_key + decap (same as TLS server side)
 void BM_SpeedMLKEM768KeyGenDecap(benchmark::State &state) {
   uint8_t ciphertext[MLKEM768_CIPHERTEXT_BYTES];
   // This ciphertext is nonsense, but decap is constant-time so, for the
@@ -44,14 +46,19 @@
   }
 }
 
+// parse + encap (same as TLS client side)
 void BM_SpeedMLKEM768ParseEncacp(benchmark::State &state) {
-  uint8_t ciphertext[MLKEM768_CIPHERTEXT_BYTES];
-  MLKEM768_private_key priv;
-  uint8_t encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES];
-  MLKEM768_generate_key(encoded_public_key, nullptr, &priv);
-  MLKEM768_public_key pub;
-
   for (auto _ : state) {
+    state.PauseTiming();
+    uint8_t encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES];
+    {
+      MLKEM768_private_key priv;
+      MLKEM768_generate_key(encoded_public_key, nullptr, &priv);
+    }
+    benchmark::DoNotOptimize(encoded_public_key);
+    state.ResumeTiming();
+
+    MLKEM768_public_key pub;
     CBS encoded_public_key_cbs;
     CBS_init(&encoded_public_key_cbs, encoded_public_key,
              sizeof(encoded_public_key));
@@ -60,13 +67,14 @@
       return;
     }
     uint8_t shared_secret[MLKEM_SHARED_SECRET_BYTES];
+    uint8_t ciphertext[MLKEM768_CIPHERTEXT_BYTES];
     MLKEM768_encap(ciphertext, shared_secret, &pub);
     benchmark::DoNotOptimize(ciphertext);
     benchmark::DoNotOptimize(shared_secret);
   }
 }
 
-
+// generate_key + decap (same as TLS server side)
 void BM_SpeedMLKEM1024KeyGenDecap(benchmark::State &state) {
   uint8_t ciphertext[MLKEM1024_CIPHERTEXT_BYTES];
   // This ciphertext is nonsense, but decap is constant-time so, for the
@@ -87,23 +95,193 @@
   }
 }
 
+// parse + encap (same as TLS client side)
 void BM_SpeedMLKEM1024ParseEncacp(benchmark::State &state) {
-  uint8_t ciphertext[MLKEM1024_CIPHERTEXT_BYTES];
-  auto priv = std::make_unique<MLKEM1024_private_key>();
-  uint8_t encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES];
-  MLKEM1024_generate_key(encoded_public_key, nullptr, &*priv);
-  auto pub = std::make_unique<MLKEM1024_public_key>();
-
   for (auto _ : state) {
+    state.PauseTiming();
+    uint8_t encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES];
+    {
+      // On heap to avoid stack frame size limit.
+      auto priv = std::make_unique<MLKEM1024_private_key>();
+      MLKEM1024_generate_key(encoded_public_key, nullptr, &*priv);
+    }
+    benchmark::DoNotOptimize(encoded_public_key);
+    state.ResumeTiming();
+
+    MLKEM1024_public_key pub;
     CBS encoded_public_key_cbs;
     CBS_init(&encoded_public_key_cbs, encoded_public_key,
              sizeof(encoded_public_key));
-    if (!MLKEM1024_parse_public_key(&*pub, &encoded_public_key_cbs)) {
+    if (!MLKEM1024_parse_public_key(&pub, &encoded_public_key_cbs)) {
       state.SkipWithError("Failure in MLKEM1024_parse_public_key.");
       return;
     }
     uint8_t shared_secret[MLKEM_SHARED_SECRET_BYTES];
-    MLKEM1024_encap(ciphertext, shared_secret, &*pub);
+    uint8_t ciphertext[MLKEM1024_CIPHERTEXT_BYTES];
+    MLKEM1024_encap(ciphertext, shared_secret, &pub);
+    benchmark::DoNotOptimize(ciphertext);
+    benchmark::DoNotOptimize(shared_secret);
+  }
+}
+
+// Per-operation microbenchmarks follow.
+
+void BM_SpeedMLKEM768KeyGenOnly(benchmark::State &state) {
+  for (auto _ : state) {
+    MLKEM768_private_key priv;
+    uint8_t encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES];
+    MLKEM768_generate_key(encoded_public_key, nullptr, &priv);
+    benchmark::DoNotOptimize(encoded_public_key);
+    benchmark::DoNotOptimize(priv);
+  }
+}
+
+void BM_SpeedMLKEM768DecapOnly(benchmark::State &state) {
+  uint8_t ciphertext[MLKEM768_CIPHERTEXT_BYTES];
+  // This ciphertext is nonsense, but decap is constant-time so, for the
+  // purposes of timing, it's fine.
+  memset(ciphertext, 42, sizeof(ciphertext));
+
+  for (auto _ : state) {
+    state.PauseTiming();
+    MLKEM768_private_key priv;
+    {
+      uint8_t encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES];
+      MLKEM768_generate_key(encoded_public_key, nullptr, &priv);
+    }
+    benchmark::DoNotOptimize(priv);
+    state.ResumeTiming();
+
+    uint8_t shared_secret[MLKEM_SHARED_SECRET_BYTES];
+    if (!MLKEM768_decap(shared_secret, ciphertext, sizeof(ciphertext), &priv)) {
+      state.SkipWithError("MLKEM768_decap failed");
+      return;
+    }
+    benchmark::DoNotOptimize(shared_secret);
+  }
+}
+
+void BM_SpeedMLKEM768ParseOnly(benchmark::State &state) {
+  for (auto _ : state) {
+    state.PauseTiming();
+    uint8_t encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES];
+    {
+      MLKEM768_private_key priv;
+      MLKEM768_generate_key(encoded_public_key, nullptr, &priv);
+    }
+    benchmark::DoNotOptimize(encoded_public_key);
+    state.ResumeTiming();
+
+    MLKEM768_public_key pub;
+    CBS encoded_public_key_cbs;
+    CBS_init(&encoded_public_key_cbs, encoded_public_key,
+             sizeof(encoded_public_key));
+    if (!MLKEM768_parse_public_key(&pub, &encoded_public_key_cbs)) {
+      state.SkipWithError("Failure in MLKEM768_parse_public_key.");
+      return;
+    }
+    benchmark::DoNotOptimize(pub);
+  }
+}
+
+void BM_SpeedMLKEM768EncapOnly(benchmark::State &state) {
+  for (auto _ : state) {
+    state.PauseTiming();
+    MLKEM768_public_key pub;
+    {
+      MLKEM768_private_key priv;
+      uint8_t encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES];
+      MLKEM768_generate_key(encoded_public_key, nullptr, &priv);
+      MLKEM768_public_from_private(&pub, &priv);
+    }
+    benchmark::DoNotOptimize(pub);
+    state.ResumeTiming();
+
+    uint8_t shared_secret[MLKEM_SHARED_SECRET_BYTES];
+    uint8_t ciphertext[MLKEM768_CIPHERTEXT_BYTES];
+    MLKEM768_encap(ciphertext, shared_secret, &pub);
+    benchmark::DoNotOptimize(ciphertext);
+    benchmark::DoNotOptimize(shared_secret);
+  }
+}
+
+void BM_SpeedMLKEM1024KeyGenOnly(benchmark::State &state) {
+  for (auto _ : state) {
+    MLKEM1024_private_key priv;
+    uint8_t encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES];
+    MLKEM1024_generate_key(encoded_public_key, nullptr, &priv);
+    benchmark::DoNotOptimize(encoded_public_key);
+    benchmark::DoNotOptimize(priv);
+  }
+}
+
+void BM_SpeedMLKEM1024DecapOnly(benchmark::State &state) {
+  uint8_t ciphertext[MLKEM1024_CIPHERTEXT_BYTES];
+  // This ciphertext is nonsense, but decap is constant-time so, for the
+  // purposes of timing, it's fine.
+  memset(ciphertext, 42, sizeof(ciphertext));
+
+  for (auto _ : state) {
+    state.PauseTiming();
+    MLKEM1024_private_key priv;
+    {
+      uint8_t encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES];
+      MLKEM1024_generate_key(encoded_public_key, nullptr, &priv);
+    }
+    benchmark::DoNotOptimize(priv);
+    state.ResumeTiming();
+
+    uint8_t shared_secret[MLKEM_SHARED_SECRET_BYTES];
+    if (!MLKEM1024_decap(shared_secret, ciphertext, sizeof(ciphertext),
+                         &priv)) {
+      state.SkipWithError("MLKEM1024_decap failed");
+      return;
+    }
+    benchmark::DoNotOptimize(shared_secret);
+  }
+}
+
+void BM_SpeedMLKEM1024ParseOnly(benchmark::State &state) {
+  for (auto _ : state) {
+    state.PauseTiming();
+    uint8_t encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES];
+    {
+      // On heap to avoid stack frame size limit.
+      auto priv = std::make_unique<MLKEM1024_private_key>();
+      MLKEM1024_generate_key(encoded_public_key, nullptr, &*priv);
+    }
+    benchmark::DoNotOptimize(encoded_public_key);
+    state.ResumeTiming();
+
+    MLKEM1024_public_key pub;
+    CBS encoded_public_key_cbs;
+    CBS_init(&encoded_public_key_cbs, encoded_public_key,
+             sizeof(encoded_public_key));
+    if (!MLKEM1024_parse_public_key(&pub, &encoded_public_key_cbs)) {
+      state.SkipWithError("Failure in MLKEM1024_parse_public_key.");
+      return;
+    }
+    benchmark::DoNotOptimize(pub);
+  }
+}
+
+void BM_SpeedMLKEM1024EncapOnly(benchmark::State &state) {
+  for (auto _ : state) {
+    state.PauseTiming();
+    MLKEM1024_public_key pub;
+    {
+      // On heap to avoid stack frame size limit.
+      auto priv = std::make_unique<MLKEM1024_private_key>();
+      uint8_t encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES];
+      MLKEM1024_generate_key(encoded_public_key, nullptr, &*priv);
+      MLKEM1024_public_from_private(&pub, &*priv);
+    }
+    benchmark::DoNotOptimize(pub);
+    state.ResumeTiming();
+
+    uint8_t shared_secret[MLKEM_SHARED_SECRET_BYTES];
+    uint8_t ciphertext[MLKEM1024_CIPHERTEXT_BYTES];
+    MLKEM1024_encap(ciphertext, shared_secret, &pub);
     benchmark::DoNotOptimize(ciphertext);
     benchmark::DoNotOptimize(shared_secret);
   }
@@ -114,6 +292,15 @@
   BENCHMARK(BM_SpeedMLKEM768ParseEncacp)->Apply(bssl::bench::SetThreads);
   BENCHMARK(BM_SpeedMLKEM1024KeyGenDecap)->Apply(bssl::bench::SetThreads);
   BENCHMARK(BM_SpeedMLKEM1024ParseEncacp)->Apply(bssl::bench::SetThreads);
+
+  BENCHMARK(BM_SpeedMLKEM768KeyGenOnly)->Apply(bssl::bench::SetThreads);
+  BENCHMARK(BM_SpeedMLKEM768DecapOnly)->Apply(bssl::bench::SetThreads);
+  BENCHMARK(BM_SpeedMLKEM768ParseOnly)->Apply(bssl::bench::SetThreads);
+  BENCHMARK(BM_SpeedMLKEM768EncapOnly)->Apply(bssl::bench::SetThreads);
+  BENCHMARK(BM_SpeedMLKEM1024KeyGenOnly)->Apply(bssl::bench::SetThreads);
+  BENCHMARK(BM_SpeedMLKEM1024DecapOnly)->Apply(bssl::bench::SetThreads);
+  BENCHMARK(BM_SpeedMLKEM1024ParseOnly)->Apply(bssl::bench::SetThreads);
+  BENCHMARK(BM_SpeedMLKEM1024EncapOnly)->Apply(bssl::bench::SetThreads);
 }
 
 }  // namespace