Add a multi-threaded mode to bssl speed

This is inspired somewhat by how https://github.com/google/benchmark's
threaded benchmark support works. (It seems to spawn a bunch of threads,
latch them all together, and then run.)

This adds a -threads flag and a TimeFunctionParallel, which runs that
many copies of the benchmark in parallel after waiting for all the
threads to synchronize. Some functions had to be tweaked so they don't
write to a single, shared output buffer.

This could probably use some improvement. When playing with it, I found
the numbers to be pretty unstable. We also don't currently benchmark
anything that captures EVP's internal refcounts. But hopefully it's
enough to get a start. I am able to measure impacts from the PRNG locks
at least.

Bug: 570
Change-Id: I92c29a05ba082fc45701afd6f0effe23f7b148bd
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/59845
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
Auto-Submit: David Benjamin <davidben@google.com>
diff --git a/tool/speed.cc b/tool/speed.cc
index 5b02059..e2594db 100644
--- a/tool/speed.cc
+++ b/tool/speed.cc
@@ -57,6 +57,12 @@
 #include <time.h>
 #endif
 
+#if defined(OPENSSL_THREADS)
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+#endif
+
 #include "../crypto/ec_extra/internal.h"
 #include "../crypto/fipsmodule/ec/internal.h"
 #include "../crypto/internal.h"
@@ -154,34 +160,37 @@
 static uint64_t g_timeout_seconds = 1;
 static std::vector<size_t> g_chunk_lengths = {16, 256, 1350, 8192, 16384};
 
-static bool TimeFunction(TimeResults *results, std::function<bool()> func) {
+// IterationsBetweenTimeChecks times one call of |func| and returns the number
+// of iterations to run between time checks (1 to 1000), or zero if it failed.
+static uint32_t IterationsBetweenTimeChecks(std::function<bool()> func) {
+  uint64_t start = time_now();
+  if (!func()) {
+    return 0;
+  }
+  uint64_t delta = time_now() - start;
+  if (delta == 0) {  // The clock saw no elapsed time; use a fixed count.
+    return 250;
+  }
+
+  // Aim for about 100ms between time checks, clamped to [1, 1000] iterations.
+  uint32_t ret = static_cast<double>(100000) / static_cast<double>(delta);
+  if (ret > 1000) {
+    ret = 1000;
+  } else if (ret < 1) {
+    ret = 1;
+  }
+  return ret;
+}
+
+static bool TimeFunctionImpl(TimeResults *results, std::function<bool()> func,
+                             uint32_t iterations_between_time_checks) {
   // total_us is the total amount of time that we'll aim to measure a function
   // for.
   const uint64_t total_us = g_timeout_seconds * 1000000;
-  uint64_t start = time_now(), now, delta;
-
-  if (!func()) {
-    return false;
-  }
-  now = time_now();
-  delta = now - start;
-  unsigned iterations_between_time_checks;
-  if (delta == 0) {
-    iterations_between_time_checks = 250;
-  } else {
-    // Aim for about 100ms between time checks.
-    iterations_between_time_checks =
-        static_cast<double>(100000) / static_cast<double>(delta);
-    if (iterations_between_time_checks > 1000) {
-      iterations_between_time_checks = 1000;
-    } else if (iterations_between_time_checks < 1) {
-      iterations_between_time_checks = 1;
-    }
-  }
-
+  uint64_t start = time_now(), now;
   uint64_t done = 0;
   for (;;) {
-    for (unsigned i = 0; i < iterations_between_time_checks; i++) {
+    for (uint32_t i = 0; i < iterations_between_time_checks; i++) {
       if (!func()) {
         return false;
       }
@@ -199,6 +208,93 @@
   return true;
 }
 
+static bool TimeFunction(TimeResults *results, std::function<bool()> func) {
+  uint32_t iterations_between_time_checks = IterationsBetweenTimeChecks(func);
+  if (iterations_between_time_checks == 0) {
+    return false;  // The calibration call to |func| failed.
+  }
+  // Time |func| on the calling thread, using the calibrated check interval.
+  return TimeFunctionImpl(results, std::move(func),
+                          iterations_between_time_checks);
+}
+
+#if defined(OPENSSL_THREADS)
+// g_threads is the number of threads to run in parallel benchmarks.
+static int g_threads = 1;
+
+// Latch lets |expected| threads wait for one another, like C++20 std::latch.
+class Latch {
+ public:
+  explicit Latch(int expected) : expected_(expected) {}
+  Latch(const Latch &) = delete;
+  Latch &operator=(const Latch &) = delete;
+  // ArriveAndWait counts the caller in, then blocks until all have arrived.
+  void ArriveAndWait() {
+    std::unique_lock<std::mutex> lock(lock_);
+    expected_--;
+    if (expected_ > 0) {
+      cond_.wait(lock, [&] { return expected_ == 0; });
+    } else {
+      cond_.notify_all();  // Last arrival: release every waiting thread.
+    }
+  }
+
+ private:
+  int expected_;  // Arrivals still outstanding; changed only under |lock_|.
+  std::mutex lock_;
+  std::condition_variable cond_;
+};
+
+static bool TimeFunctionParallel(TimeResults *results,
+                                 std::function<bool()> func) {
+  if (g_threads <= 1) {  // Nothing to parallelize; use the serial path.
+    return TimeFunction(results, std::move(func));
+  }
+  // Calibrate once up front; every worker thread reuses this interval.
+  uint32_t iterations_between_time_checks = IterationsBetweenTimeChecks(func);
+  if (iterations_between_time_checks == 0) {
+    return false;
+  }
+  // Workers share |func| (it runs concurrently) but write separate results.
+  struct ThreadResult {
+    TimeResults time_result;
+    bool ok = false;
+  };
+  std::vector<ThreadResult> thread_results(g_threads);
+  Latch latch(g_threads);
+  std::vector<std::thread> threads;
+  for (int i = 0; i < g_threads; i++) {
+    threads.emplace_back([&, i] {
+      // Wait for all the threads to be ready before running the benchmark.
+      latch.ArriveAndWait();
+      thread_results[i].ok = TimeFunctionImpl(
+          &thread_results[i].time_result, func, iterations_between_time_checks);
+    });
+  }
+  // Workers capture locals by reference, so join them before returning.
+  for (auto &thread : threads) {
+    thread.join();
+  }
+  // Add up the per-thread totals; a failure in any thread fails the run.
+  results->num_calls = 0;
+  results->us = 0;
+  for (const auto& pair : thread_results) {
+    if (!pair.ok) {
+      return false;
+    }
+    results->num_calls += pair.time_result.num_calls;
+    results->us += pair.time_result.us;  // Summed across threads.
+  }
+  return true;
+}
+
+#else
+static bool TimeFunctionParallel(TimeResults *results,
+                                 std::function<bool()> func) {
+  return TimeFunction(results, std::move(func));  // Built without threads.
+}
+#endif
+
 static bool SpeedRSA(const std::string &selected) {
   if (!selected.empty() && selected.find("RSA") == std::string::npos) {
     return true;
@@ -224,18 +320,21 @@
       return false;
     }
 
-    std::unique_ptr<uint8_t[]> sig(new uint8_t[RSA_size(key.get())]);
+    static constexpr size_t kMaxSignature = 512;
+    if (RSA_size(key.get()) > kMaxSignature) {
+      abort();
+    }
     const uint8_t fake_sha256_hash[32] = {0};
-    unsigned sig_len;
 
     TimeResults results;
-    if (!TimeFunction(&results,
-                      [&key, &sig, &fake_sha256_hash, &sig_len]() -> bool {
-          // Usually during RSA signing we're using a long-lived |RSA| that has
-          // already had all of its |BN_MONT_CTX|s constructed, so it makes
-          // sense to use |key| directly here.
-          return RSA_sign(NID_sha256, fake_sha256_hash, sizeof(fake_sha256_hash),
-                          sig.get(), &sig_len, key.get());
+    if (!TimeFunctionParallel(&results, [&key, &fake_sha256_hash]() -> bool {
+          // Usually during RSA signing we're using a long-lived |RSA| that
+          // has already had all of its |BN_MONT_CTX|s constructed, so it
+          // makes sense to use |key| directly here.
+          uint8_t out[kMaxSignature];
+          unsigned out_len;
+          return RSA_sign(NID_sha256, fake_sha256_hash,
+                          sizeof(fake_sha256_hash), out, &out_len, key.get());
         })) {
       fprintf(stderr, "RSA_sign failed.\n");
       ERR_print_errors_fp(stderr);
@@ -243,46 +342,51 @@
     }
     results.Print(name + " signing");
 
-    if (!TimeFunction(&results,
-                      [&key, &fake_sha256_hash, &sig, sig_len]() -> bool {
-          return RSA_verify(
-              NID_sha256, fake_sha256_hash, sizeof(fake_sha256_hash),
-              sig.get(), sig_len, key.get());
-        })) {
+    uint8_t sig[kMaxSignature];
+    unsigned sig_len;
+    if (!RSA_sign(NID_sha256, fake_sha256_hash, sizeof(fake_sha256_hash), sig,
+                  &sig_len, key.get())) {
+      return false;
+    }
+    if (!TimeFunctionParallel(
+            &results, [&key, &fake_sha256_hash, &sig, sig_len]() -> bool {
+              return RSA_verify(NID_sha256, fake_sha256_hash,
+                                sizeof(fake_sha256_hash), sig, sig_len,
+                                key.get());
+            })) {
       fprintf(stderr, "RSA_verify failed.\n");
       ERR_print_errors_fp(stderr);
       return false;
     }
     results.Print(name + " verify (same key)");
 
-    if (!TimeFunction(&results,
-                      [&key, &fake_sha256_hash, &sig, sig_len]() -> bool {
-          // Usually during RSA verification we have to parse an RSA key from a
-          // certificate or similar, in which case we'd need to construct a new
-          // RSA key, with a new |BN_MONT_CTX| for the public modulus. If we
-          // were to use |key| directly instead, then these costs wouldn't be
-          // accounted for.
-          bssl::UniquePtr<RSA> verify_key(RSA_new());
-          if (!verify_key) {
-            return false;
-          }
-          verify_key->n = BN_dup(key->n);
-          verify_key->e = BN_dup(key->e);
-          if (!verify_key->n ||
-              !verify_key->e) {
-            return false;
-          }
-          return RSA_verify(NID_sha256, fake_sha256_hash,
-                            sizeof(fake_sha256_hash), sig.get(), sig_len,
-                            verify_key.get());
-        })) {
+    if (!TimeFunctionParallel(
+            &results, [&key, &fake_sha256_hash, &sig, sig_len]() -> bool {
+              // Usually during RSA verification we have to parse an RSA key
+              // from a certificate or similar, in which case we'd need to
+              // construct a new RSA key, with a new |BN_MONT_CTX| for the
+              // public modulus. If we were to use |key| directly instead, then
+              // these costs wouldn't be accounted for.
+              bssl::UniquePtr<RSA> verify_key(RSA_new());
+              if (!verify_key) {
+                return false;
+              }
+              verify_key->n = BN_dup(key->n);
+              verify_key->e = BN_dup(key->e);
+              if (!verify_key->n || !verify_key->e) {
+                return false;
+              }
+              return RSA_verify(NID_sha256, fake_sha256_hash,
+                                sizeof(fake_sha256_hash), sig, sig_len,
+                                verify_key.get());
+            })) {
       fprintf(stderr, "RSA_verify failed.\n");
       ERR_print_errors_fp(stderr);
       return false;
     }
     results.Print(name + " verify (fresh key)");
 
-    if (!TimeFunction(&results, [&]() -> bool {
+    if (!TimeFunctionParallel(&results, [&]() -> bool {
           return bssl::UniquePtr<RSA>(RSA_private_key_from_bytes(
                      kRSAKeys[i].key, kRSAKeys[i].key_len)) != nullptr;
         })) {
@@ -414,6 +518,8 @@
     return false;
   }
 
+  // TODO(davidben): In most cases, this can be |TimeFunctionParallel|, but a
+  // few stateful AEADs must be run serially.
   TimeResults results;
   if (direction == evp_aead_seal) {
     if (!TimeFunction(&results,
@@ -504,7 +610,7 @@
 
   {
     TimeResults results;
-    if (!TimeFunction(&results, [&]() -> bool {
+    if (!TimeFunctionParallel(&results, [&]() -> bool {
           AES_KEY key;
           return AES_set_encrypt_key(kZero, bits, &key) == 0;
         })) {
@@ -521,7 +627,7 @@
     }
     uint8_t block[16] = {0};
     TimeResults results;
-    if (!TimeFunction(&results, [&]() -> bool {
+    if (!TimeFunctionParallel(&results, [&]() -> bool {
           AES_encrypt(block, block, &key);
           return true;
         })) {
@@ -533,7 +639,7 @@
 
   {
     TimeResults results;
-    if (!TimeFunction(&results, [&]() -> bool {
+    if (!TimeFunctionParallel(&results, [&]() -> bool {
           AES_KEY key;
           return AES_set_decrypt_key(kZero, bits, &key) == 0;
         })) {
@@ -550,7 +656,7 @@
     }
     uint8_t block[16] = {0};
     TimeResults results;
-    if (!TimeFunction(&results, [&]() -> bool {
+    if (!TimeFunctionParallel(&results, [&]() -> bool {
           AES_decrypt(block, block, &key);
           return true;
         })) {
@@ -565,7 +671,6 @@
 
 static bool SpeedHashChunk(const EVP_MD *md, std::string name,
                            size_t chunk_len) {
-  bssl::ScopedEVP_MD_CTX ctx;
   uint8_t input[16384] = {0};
 
   if (chunk_len > sizeof(input)) {
@@ -574,10 +679,11 @@
 
   name += ChunkLenSuffix(chunk_len);
   TimeResults results;
-  if (!TimeFunction(&results, [&ctx, md, chunk_len, &input]() -> bool {
+  if (!TimeFunctionParallel(&results, [md, chunk_len, &input]() -> bool {
         uint8_t digest[EVP_MAX_MD_SIZE];
         unsigned int md_len;
 
+        bssl::ScopedEVP_MD_CTX ctx;
         return EVP_DigestInit_ex(ctx.get(), md, NULL /* ENGINE */) &&
                EVP_DigestUpdate(ctx.get(), input, chunk_len) &&
                EVP_DigestFinal_ex(ctx.get(), digest, &md_len);
@@ -607,15 +713,15 @@
 }
 
 static bool SpeedRandomChunk(std::string name, size_t chunk_len) {
-  uint8_t scratch[16384];
-
-  if (chunk_len > sizeof(scratch)) {
+  static constexpr size_t kMaxChunk = 16384;
+  if (chunk_len > kMaxChunk) {
     return false;
   }
 
   name += ChunkLenSuffix(chunk_len);
   TimeResults results;
-  if (!TimeFunction(&results, [chunk_len, &scratch]() -> bool {
+  if (!TimeFunctionParallel(&results, [chunk_len]() -> bool {
+        uint8_t scratch[kMaxChunk];
         RAND_bytes(scratch, chunk_len);
         return true;
       })) {
@@ -667,29 +773,29 @@
   }
 
   TimeResults results;
-  if (!TimeFunction(&results, [nid, peer_value_len, &peer_value]() -> bool {
-        bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(nid));
-        if (!key ||
-            !EC_KEY_generate_key(key.get())) {
-          return false;
-        }
-        const EC_GROUP *const group = EC_KEY_get0_group(key.get());
-        bssl::UniquePtr<EC_POINT> point(EC_POINT_new(group));
-        bssl::UniquePtr<EC_POINT> peer_point(EC_POINT_new(group));
-        bssl::UniquePtr<BN_CTX> ctx(BN_CTX_new());
-        bssl::UniquePtr<BIGNUM> x(BN_new());
-        if (!point || !peer_point || !ctx || !x ||
-            !EC_POINT_oct2point(group, peer_point.get(), peer_value.get(),
-                                peer_value_len, ctx.get()) ||
-            !EC_POINT_mul(group, point.get(), nullptr, peer_point.get(),
-                          EC_KEY_get0_private_key(key.get()), ctx.get()) ||
-            !EC_POINT_get_affine_coordinates_GFp(group, point.get(), x.get(),
-                                                 nullptr, ctx.get())) {
-          return false;
-        }
+  if (!TimeFunctionParallel(
+          &results, [nid, peer_value_len, &peer_value]() -> bool {
+            bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(nid));
+            if (!key || !EC_KEY_generate_key(key.get())) {
+              return false;
+            }
+            const EC_GROUP *const group = EC_KEY_get0_group(key.get());
+            bssl::UniquePtr<EC_POINT> point(EC_POINT_new(group));
+            bssl::UniquePtr<EC_POINT> peer_point(EC_POINT_new(group));
+            bssl::UniquePtr<BN_CTX> ctx(BN_CTX_new());
+            bssl::UniquePtr<BIGNUM> x(BN_new());
+            if (!point || !peer_point || !ctx || !x ||
+                !EC_POINT_oct2point(group, peer_point.get(), peer_value.get(),
+                                    peer_value_len, ctx.get()) ||
+                !EC_POINT_mul(group, point.get(), nullptr, peer_point.get(),
+                              EC_KEY_get0_private_key(key.get()), ctx.get()) ||
+                !EC_POINT_get_affine_coordinates_GFp(
+                    group, point.get(), x.get(), nullptr, ctx.get())) {
+              return false;
+            }
 
-        return true;
-      })) {
+            return true;
+          })) {
     return false;
   }
 
@@ -709,17 +815,18 @@
     return false;
   }
 
-  uint8_t signature[256];
-  if (ECDSA_size(key.get()) > sizeof(signature)) {
-    return false;
+  static constexpr size_t kMaxSignature = 256;
+  if (ECDSA_size(key.get()) > kMaxSignature) {
+    abort();
   }
   uint8_t digest[20];
   OPENSSL_memset(digest, 42, sizeof(digest));
-  unsigned sig_len;
 
   TimeResults results;
-  if (!TimeFunction(&results, [&key, &signature, &digest, &sig_len]() -> bool {
-        return ECDSA_sign(0, digest, sizeof(digest), signature, &sig_len,
+  if (!TimeFunctionParallel(&results, [&key, &digest]() -> bool {
+        uint8_t out[kMaxSignature];
+        unsigned out_len;
+        return ECDSA_sign(0, digest, sizeof(digest), out, &out_len,
                           key.get()) == 1;
       })) {
     return false;
@@ -727,10 +834,17 @@
 
   results.Print(name + " signing");
 
-  if (!TimeFunction(&results, [&key, &signature, &digest, sig_len]() -> bool {
-        return ECDSA_verify(0, digest, sizeof(digest), signature, sig_len,
-                            key.get()) == 1;
-      })) {
+  uint8_t signature[kMaxSignature];
+  unsigned sig_len;
+  if (!ECDSA_sign(0, digest, sizeof(digest), signature, &sig_len, key.get())) {
+    return false;
+  }
+
+  if (!TimeFunctionParallel(
+          &results, [&key, &signature, &digest, sig_len]() -> bool {
+            return ECDSA_verify(0, digest, sizeof(digest), signature, sig_len,
+                                key.get()) == 1;
+          })) {
     return false;
   }
 
@@ -759,10 +873,8 @@
   }
 
   TimeResults results;
-
-  uint8_t public_key[32], private_key[64];
-
-  if (!TimeFunction(&results, [&public_key, &private_key]() -> bool {
+  if (!TimeFunctionParallel(&results, []() -> bool {
+        uint8_t public_key[32], private_key[64];
         ED25519_keypair(public_key, private_key);
         return true;
       })) {
@@ -771,19 +883,25 @@
 
   results.Print("Ed25519 key generation");
 
+  uint8_t public_key[32], private_key[64];
+  ED25519_keypair(public_key, private_key);
   static const uint8_t kMessage[] = {0, 1, 2, 3, 4, 5};
-  uint8_t signature[64];
 
-  if (!TimeFunction(&results, [&private_key, &signature]() -> bool {
-        return ED25519_sign(signature, kMessage, sizeof(kMessage),
-                            private_key) == 1;
+  if (!TimeFunctionParallel(&results, [&private_key]() -> bool {
+        uint8_t out[64];
+        return ED25519_sign(out, kMessage, sizeof(kMessage), private_key) == 1;
       })) {
     return false;
   }
 
   results.Print("Ed25519 signing");
 
-  if (!TimeFunction(&results, [&public_key, &signature]() -> bool {
+  uint8_t signature[64];
+  if (!ED25519_sign(signature, kMessage, sizeof(kMessage), private_key)) {
+    return false;
+  }
+
+  if (!TimeFunctionParallel(&results, [&public_key, &signature]() -> bool {
         return ED25519_verify(kMessage, sizeof(kMessage), signature,
                               public_key) == 1;
       })) {
@@ -793,7 +911,7 @@
 
   results.Print("Ed25519 verify");
 
-  if (!TimeFunction(&results, []() -> bool {
+  if (!TimeFunctionParallel(&results, []() -> bool {
         uint8_t out[32], in[32];
         OPENSSL_memset(in, 0, sizeof(in));
         X25519_public_from_private(out, in);
@@ -805,7 +923,7 @@
 
   results.Print("Curve25519 base-point multiplication");
 
-  if (!TimeFunction(&results, []() -> bool {
+  if (!TimeFunctionParallel(&results, []() -> bool {
         uint8_t out[32], in1[32], in2[32];
         OPENSSL_memset(in1, 0, sizeof(in1));
         OPENSSL_memset(in2, 0, sizeof(in2));
@@ -845,10 +963,10 @@
     return false;
   }
 
-  if (!TimeFunction(&results, [&alice_msg, alice_msg_len]() -> bool {
-        bssl::UniquePtr<SPAKE2_CTX> bob(SPAKE2_CTX_new(spake2_role_bob,
-                                        kBobName, sizeof(kBobName), kAliceName,
-                                        sizeof(kAliceName)));
+  if (!TimeFunctionParallel(&results, [&alice_msg, alice_msg_len]() -> bool {
+        bssl::UniquePtr<SPAKE2_CTX> bob(
+            SPAKE2_CTX_new(spake2_role_bob, kBobName, sizeof(kBobName),
+                           kAliceName, sizeof(kAliceName)));
         uint8_t bob_msg[SPAKE2_MAX_MSG_SIZE], bob_key[64];
         size_t bob_msg_len, bob_key_len;
         if (!SPAKE2_generate_msg(bob.get(), bob_msg, &bob_msg_len,
@@ -879,7 +997,7 @@
   static const char kPassword[] = "password";
   static const uint8_t kSalt[] = "NaCl";
 
-  if (!TimeFunction(&results, [&]() -> bool {
+  if (!TimeFunctionParallel(&results, [&]() -> bool {
         uint8_t out[64];
         return !!EVP_PBE_scrypt(kPassword, sizeof(kPassword) - 1, kSalt,
                                 sizeof(kSalt) - 1, 1024, 8, 16, 0 /* max_mem */,
@@ -890,7 +1008,7 @@
   }
   results.Print("scrypt (N = 1024, r = 8, p = 16)");
 
-  if (!TimeFunction(&results, [&]() -> bool {
+  if (!TimeFunctionParallel(&results, [&]() -> bool {
         uint8_t out[64];
         return !!EVP_PBE_scrypt(kPassword, sizeof(kPassword) - 1, kSalt,
                                 sizeof(kSalt) - 1, 16384, 8, 1, 0 /* max_mem */,
@@ -911,7 +1029,7 @@
 
   TimeResults results;
 
-  if (!TimeFunction(&results, []() -> bool {
+  if (!TimeFunctionParallel(&results, []() -> bool {
         struct HRSS_public_key pub;
         struct HRSS_private_key priv;
         uint8_t entropy[HRSS_GENERATE_KEY_BYTES];
@@ -932,22 +1050,29 @@
     return false;
   }
 
-  uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES];
-  if (!TimeFunction(&results, [&pub, &ciphertext]() -> bool {
+  if (!TimeFunctionParallel(&results, [&pub]() -> bool {
         uint8_t entropy[HRSS_ENCAP_BYTES];
         uint8_t shared_key[HRSS_KEY_BYTES];
+        uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES];
         RAND_bytes(entropy, sizeof(entropy));
         return HRSS_encap(ciphertext, shared_key, &pub, entropy);
       })) {
     fprintf(stderr, "Failed to time HRSS_encap.\n");
     return false;
   }
-
   results.Print("HRSS encap");
 
-  if (!TimeFunction(&results, [&priv, &ciphertext]() -> bool {
-        uint8_t shared_key[HRSS_KEY_BYTES];
-        return HRSS_decap(shared_key, &priv, ciphertext, sizeof(ciphertext));
+  uint8_t entropy[HRSS_ENCAP_BYTES];
+  uint8_t shared_key[HRSS_KEY_BYTES];
+  uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES];
+  RAND_bytes(entropy, sizeof(entropy));
+  if (!HRSS_encap(ciphertext, shared_key, &pub, entropy)) {
+    return false;
+  }
+
+  if (!TimeFunctionParallel(&results, [&priv, &ciphertext]() -> bool {
+        uint8_t shared_key2[HRSS_KEY_BYTES];
+        return HRSS_decap(shared_key2, &priv, ciphertext, sizeof(ciphertext));
       })) {
     fprintf(stderr, "Failed to time HRSS_encap.\n");
     return false;
@@ -965,39 +1090,39 @@
 
   TimeResults results;
 
-  KYBER_private_key priv;
-  uint8_t encoded_public_key[KYBER_PUBLIC_KEY_BYTES];
   uint8_t ciphertext[KYBER_CIPHERTEXT_BYTES];
   // This ciphertext is nonsense, but Kyber decap is constant-time so, for the
   // purposes of timing, it's fine.
   memset(ciphertext, 42, sizeof(ciphertext));
-  if (!TimeFunction(&results,
-                    [&priv, &encoded_public_key, &ciphertext]() -> bool {
-                      uint8_t shared_secret[32];
-                      KYBER_generate_key(encoded_public_key, &priv);
-                      KYBER_decap(shared_secret, sizeof(shared_secret),
-                                  ciphertext, &priv);
-                      return true;
-                    })) {
+  if (!TimeFunctionParallel(&results, [&]() -> bool {
+        KYBER_private_key priv;
+        uint8_t encoded_public_key[KYBER_PUBLIC_KEY_BYTES];
+        KYBER_generate_key(encoded_public_key, &priv);
+        uint8_t shared_secret[32];
+        KYBER_decap(shared_secret, sizeof(shared_secret), ciphertext, &priv);
+        return true;
+      })) {
     fprintf(stderr, "Failed to time KYBER_generate_key + KYBER_decap.\n");
     return false;
   }
 
   results.Print("Kyber generate + decap");
 
+  KYBER_private_key priv;
+  uint8_t encoded_public_key[KYBER_PUBLIC_KEY_BYTES];
+  KYBER_generate_key(encoded_public_key, &priv);
   KYBER_public_key pub;
-  if (!TimeFunction(
-          &results, [&pub, &ciphertext, &encoded_public_key]() -> bool {
-            CBS encoded_public_key_cbs;
-            CBS_init(&encoded_public_key_cbs, encoded_public_key,
-                     sizeof(encoded_public_key));
-            if (!KYBER_parse_public_key(&pub, &encoded_public_key_cbs)) {
-              return false;
-            }
-            uint8_t shared_secret[32];
-            KYBER_encap(ciphertext, shared_secret, sizeof(shared_secret), &pub);
-            return true;
-          })) {
+  if (!TimeFunctionParallel(&results, [&]() -> bool {
+        CBS encoded_public_key_cbs;
+        CBS_init(&encoded_public_key_cbs, encoded_public_key,
+                 sizeof(encoded_public_key));
+        if (!KYBER_parse_public_key(&pub, &encoded_public_key_cbs)) {
+          return false;
+        }
+        uint8_t shared_secret[32];
+        KYBER_encap(ciphertext, shared_secret, sizeof(shared_secret), &pub);
+        return true;
+      })) {
     fprintf(stderr, "Failed to time KYBER_encap.\n");
     return false;
   }
@@ -1023,7 +1148,7 @@
     if (p256 == NULL) {
       return false;
     }
-    if (!TimeFunction(&results, [&]() -> bool {
+    if (!TimeFunctionParallel(&results, [&]() -> bool {
           EC_JACOBIAN out;
           return ec_hash_to_curve_p256_xmd_sha256_sswu(
               p256, &out, kLabel, sizeof(kLabel), input, sizeof(input));
@@ -1037,7 +1162,7 @@
     if (p384 == NULL) {
       return false;
     }
-    if (!TimeFunction(&results, [&]() -> bool {
+    if (!TimeFunctionParallel(&results, [&]() -> bool {
           EC_JACOBIAN out;
           return ec_hash_to_curve_p384_xmd_sha384_sswu(
               p384, &out, kLabel, sizeof(kLabel), input, sizeof(input));
@@ -1047,7 +1172,7 @@
     }
     results.Print("hash-to-curve P384_XMD:SHA-384_SSWU_RO_");
 
-    if (!TimeFunction(&results, [&]() -> bool {
+    if (!TimeFunctionParallel(&results, [&]() -> bool {
           EC_SCALAR out;
           return ec_hash_to_scalar_p384_xmd_sha512_draft07(
               p384, &out, kLabel, sizeof(kLabel), input, sizeof(input));
@@ -1088,11 +1213,11 @@
     "HWy+iMf6/7p/Ak/SIicM4XSwmlQ8pPxAZPr+E2LoVd9pMpWUwpW2UbtO5wsGTrY5"
     "sO45tFNN/y+jtUheB1C2ijObG/tXELaiyCdM+S/waeuv0MXtI4xnn1A=";
 
-  std::vector<uint8_t> out(strlen(kInput));
-  size_t len;
   TimeResults results;
-  if (!TimeFunction(&results, [&]() -> bool {
-        return EVP_DecodeBase64(out.data(), &len, out.size(),
+  if (!TimeFunctionParallel(&results, [&]() -> bool {
+        uint8_t out[sizeof(kInput)];
+        size_t len;
+        return EVP_DecodeBase64(out, &len, sizeof(out),
                                 reinterpret_cast<const uint8_t *>(kInput),
                                 strlen(kInput));
       })) {
@@ -1112,7 +1237,7 @@
   for (size_t len : g_chunk_lengths) {
     std::vector<uint8_t> input(len);
     TimeResults results;
-    if (!TimeFunction(&results, [&]() -> bool {
+    if (!TimeFunctionParallel(&results, [&]() -> bool {
           SIPHASH_24(key, input.data(), input.size());
           return true;
         })) {
@@ -1383,6 +1508,13 @@
         "there is no information about the bytes per call for an  operation, "
         "the JSON field for bytesPerCall will be omitted.",
     },
+#if defined(OPENSSL_THREADS)
+    {
+        "-threads",
+        kOptionalArgument,
+        "The number of threads to benchmark in parallel (default is 1)",
+    },
+#endif
     {
         "",
         kOptionalArgument,
@@ -1410,6 +1542,12 @@
     g_timeout_seconds = atoi(args_map["-timeout"].c_str());
   }
 
+#if defined(OPENSSL_THREADS)
+  if (args_map.count("-threads") != 0) {
+    g_threads = atoi(args_map["-threads"].c_str());
+  }
+#endif
+
   if (args_map.count("-chunks") != 0) {
     g_chunk_lengths.clear();
     const char *start = args_map["-chunks"].data();