Remove pooling of PRNG state.

Prior to 82639e6f we used thread-local data for the PRNG state. That
change switched to using a mutex-protected pool instead in order to save
memory in heavily-threaded applications.

However, the pool mutex can get extremely hot in cases where the PRNG is
heavily used. 8e8f2504 was a short-term work around, but supporting both
modes is overly complex.

This change moves back to the state of the prior to 82639e6f. The best
way to review this is to diff the changed files against '82639e6f^' and
note that the only difference is a comment added in rand.c:
https://paste.googleplex.com/4997991748337664

Change-Id: I8febce089696fa6bc39f94f4a1e268127a8f78db
Reviewed-on: https://boringssl-review.googlesource.com/c/34024
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/rand/rand.c b/crypto/fipsmodule/rand/rand.c
index bff2471..4128033 100644
--- a/crypto/fipsmodule/rand/rand.c
+++ b/crypto/fipsmodule/rand/rand.c
@@ -54,6 +54,75 @@
 // continuous random number generator test in FIPS 140-2, section 4.9.2.
 #define CRNGT_BLOCK_SIZE 16
 
+// rand_thread_state contains the per-thread state for the RNG.
+struct rand_thread_state {
+  CTR_DRBG_STATE drbg;
+  // calls is the number of generate calls made on |drbg| since it was last
+  // (re)seeded. This is bound by |kReseedInterval|.
+  unsigned calls;
+  // last_block_valid is non-zero iff |last_block| contains data from
+  // |CRYPTO_sysrand|.
+  int last_block_valid;
+
+#if defined(BORINGSSL_FIPS)
+  // last_block contains the previous block from |CRYPTO_sysrand|.
+  uint8_t last_block[CRNGT_BLOCK_SIZE];
+  // next and prev form a NULL-terminated, double-linked list of all states in
+  // a process.
+  struct rand_thread_state *next, *prev;
+#endif
+};
+
+#if defined(BORINGSSL_FIPS)
+// thread_states_list is the head of a linked-list of all |rand_thread_state|
+// objects in the process, one per thread. This is needed because FIPS requires
+// that they be zeroed on process exit, but thread-local destructors aren't
+// called when the whole process is exiting.
+DEFINE_BSS_GET(struct rand_thread_state *, thread_states_list);
+DEFINE_STATIC_MUTEX(thread_states_list_lock);
+
+static void rand_thread_state_clear_all(void) __attribute__((destructor));
+static void rand_thread_state_clear_all(void) {
+  CRYPTO_STATIC_MUTEX_lock_write(thread_states_list_lock_bss_get());
+  for (struct rand_thread_state *cur = *thread_states_list_bss_get();
+       cur != NULL; cur = cur->next) {
+    CTR_DRBG_clear(&cur->drbg);
+  }
+  // |thread_states_list_lock is deliberately left locked so that any threads
+  // that are still running will hang if they try to call |RAND_bytes|.
+}
+#endif
+
+// rand_thread_state_free frees a |rand_thread_state|. This is called when a
+// thread exits.
+static void rand_thread_state_free(void *state_in) {
+  struct rand_thread_state *state = state_in;
+
+  if (state_in == NULL) {
+    return;
+  }
+
+#if defined(BORINGSSL_FIPS)
+  CRYPTO_STATIC_MUTEX_lock_write(thread_states_list_lock_bss_get());
+
+  if (state->prev != NULL) {
+    state->prev->next = state->next;
+  } else {
+    *thread_states_list_bss_get() = state->next;
+  }
+
+  if (state->next != NULL) {
+    state->next->prev = state->prev;
+  }
+
+  CRYPTO_STATIC_MUTEX_unlock_write(thread_states_list_lock_bss_get());
+
+  CTR_DRBG_clear(&state->drbg);
+#endif
+
+  OPENSSL_free(state);
+}
+
 #if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM) && \
     !defined(BORINGSSL_UNSAFE_DETERMINISTIC_MODE)
 
@@ -103,36 +172,9 @@
 
 #endif
 
-// rand_state contains an RNG state. State object are managed in one of two
-// ways, depending on whether |RAND_enable_fork_unsafe_buffering| has been
-// called: if it has been called then thread-local storage is used to keep a
-// per-thread state. Otherwise a mutex-protected pool of state objects is used.
-struct rand_state {
-  CTR_DRBG_STATE drbg;
-  // next forms a NULL-terminated linked-list of all free |rand_state| objects
-  // in a pool. This is unused if using thread-local states.
-  struct rand_state *next;
-  // calls is the number of generate calls made on |drbg| since it was last
-  // (re)seeded. This is bound by
-  // |kReseedInterval - 1 + SIZE_MAX / CTR_DRBG_MAX_GENERATE_LENGTH|.
-  size_t calls;
-
-#if defined(BORINGSSL_FIPS)
-  // prev_all and next_all form another NULL-terminated linked-list, this time
-  // of all |rand_state| objects that have been allocated including those that
-  // might currently be in use.
-  struct rand_state *prev_all, *next_all;
-  // last_block contains the previous block from |CRYPTO_sysrand|.
-  uint8_t last_block[CRNGT_BLOCK_SIZE];
-  // last_block_valid is non-zero iff |last_block| contains data from
-  // |CRYPTO_sysrand|.
-  int last_block_valid;
-#endif
-};
-
 #if defined(BORINGSSL_FIPS)
 
-static void rand_get_seed(struct rand_state *state,
+static void rand_get_seed(struct rand_thread_state *state,
                           uint8_t seed[CTR_DRBG_ENTROPY_LEN]) {
   if (!state->last_block_valid) {
     if (!hwrand(state->last_block, sizeof(state->last_block))) {
@@ -181,7 +223,7 @@
 
 #else
 
-static void rand_get_seed(struct rand_state *state,
+static void rand_get_seed(struct rand_thread_state *state,
                           uint8_t seed[CTR_DRBG_ENTROPY_LEN]) {
   // If not in FIPS mode, we don't overread from the system entropy source and
   // we don't depend only on the hardware RDRAND.
@@ -190,156 +232,12 @@
 
 #endif
 
-// rand_state_free_list is a list of currently free, |rand_state| structures.
-// (It is only used if a mutex-pool is being used to manage |rand_state|
-// objects.) When a thread needs a |rand_state| it picks the head element of
-// this list and allocs a new one if the list is empty. Once it's finished, it
-// pushes the state back onto the front of the list.
-//
-// Since we don't free |rand_state| objects, the number of objects in memory
-// will eventually equal the maximum concurrency of |RAND_bytes| in the
-// mutex-pool model.
-DEFINE_BSS_GET(struct rand_state *, rand_state_free_list);
-
-// rand_state_lock protects |rand_state_free_list| (and |rand_state_all_list|,
-// in FIPS mode).
-DEFINE_STATIC_MUTEX(rand_state_lock);
-
-#if defined(BORINGSSL_FIPS)
-// rand_state_all_list is the head of a linked-list of all |rand_state| objects
-// in the process. This is needed because FIPS requires that they be zeroed on
-// process exit.
-DEFINE_BSS_GET(struct rand_state *, rand_state_all_list);
-
-// rand_drbg_lock is taken in write mode by |rand_state_clear_all|, and
-// in read mode by any operation on the |drbg| member of |rand_state|.
-// This ensures that, in the event that a thread races destructor functions, we
-// never return bogus random data. At worst, the thread will deadlock.
-DEFINE_STATIC_MUTEX(rand_drbg_lock);
-
-static void rand_state_clear_all(void) __attribute__((destructor));
-static void rand_state_clear_all(void) {
-  CRYPTO_STATIC_MUTEX_lock_write(rand_drbg_lock_bss_get());
-  CRYPTO_STATIC_MUTEX_lock_write(rand_state_lock_bss_get());
-  for (struct rand_state *cur = *rand_state_all_list_bss_get();
-       cur != NULL; cur = cur->next_all) {
-    CTR_DRBG_clear(&cur->drbg);
-  }
-  // Both locks are deliberately left locked so that any threads that are still
-  // running will hang if they try to call |RAND_bytes|.
-}
-#endif
-
-// rand_state_free frees a |rand_state|. This is called when a thread exits if
-// we're using thread-local states.
-static void rand_state_free(void *state_in) {
-  struct rand_state *state = state_in;
-  if (state_in == NULL) {
-    return;
-  }
-
-#if defined(BORINGSSL_FIPS)
-  CRYPTO_STATIC_MUTEX_lock_write(rand_state_lock_bss_get());
-  if (state->prev_all != NULL) {
-    state->prev_all->next_all = state->next_all;
-  } else {
-    *rand_state_all_list_bss_get() = state->next_all;
-  }
-
-  if (state->next_all != NULL) {
-    state->next_all->prev_all = state->prev_all;
-  }
-  CRYPTO_STATIC_MUTEX_unlock_write(rand_state_lock_bss_get());
-
-  CTR_DRBG_clear(&state->drbg);
-#endif
-
-  OPENSSL_free(state);
-}
-
-// rand_state_init seeds a |rand_state|.
-static void rand_state_init(struct rand_state *state) {
-  OPENSSL_memset(state, 0, sizeof(struct rand_state));
-  uint8_t seed[CTR_DRBG_ENTROPY_LEN];
-  rand_get_seed(state, seed);
-  if (!CTR_DRBG_init(&state->drbg, seed, NULL, 0)) {
-    abort();
-  }
-}
-
-// rand_state_get returns a usable |rand_state|, or NULL if memory is exhausted.
-//
-// If a pool is being used, it pops a |rand_state| from the head of
-// |rand_state_free_list| and returns it. If the list is empty, it
-// creates a fresh |rand_state| and returns that instead.
-//
-// Alternatively, if thread-local states are being used, it returns the current
-// thread's state object and creates it if needed.
-static struct rand_state *rand_state_get(const int fork_unsafe_buffering) {
-  struct rand_state *state = NULL;
-  if (fork_unsafe_buffering) {
-    // Thread-local storage is used in this case. This is unrelated to fork-
-    // safety and we are overloading this global control to also identify
-    // processes that really care about PRNG speed.
-    state = CRYPTO_get_thread_local(OPENSSL_THREAD_LOCAL_RAND);
-  } else {
-    // Otherwise a mutex-protected pool of states is used.
-    CRYPTO_STATIC_MUTEX_lock_write(rand_state_lock_bss_get());
-    state = *rand_state_free_list_bss_get();
-    if (state != NULL) {
-      *rand_state_free_list_bss_get() = state->next;
-    }
-    CRYPTO_STATIC_MUTEX_unlock_write(rand_state_lock_bss_get());
-  }
-
-  if (state != NULL) {
-    return state;
-  }
-
-  state = OPENSSL_malloc(sizeof(struct rand_state));
-  if (state == NULL) {
-    return NULL;
-  }
-
-  rand_state_init(state);
-
-#if defined(BORINGSSL_FIPS)
-  CRYPTO_STATIC_MUTEX_lock_write(rand_state_lock_bss_get());
-  state->next_all = *rand_state_all_list_bss_get();
-  if (state->next_all) {
-    state->next_all->prev_all = state;
-  }
-  *rand_state_all_list_bss_get() = state;
-  CRYPTO_STATIC_MUTEX_unlock_write(rand_state_lock_bss_get());
-#endif
-
-  if (fork_unsafe_buffering &&
-      !CRYPTO_set_thread_local(OPENSSL_THREAD_LOCAL_RAND, state,
-                               rand_state_free)) {
-    rand_state_free(state);
-    return NULL;
-  }
-
-  return state;
-}
-
-// rand_state_put pushes |state| onto |rand_state_free_list| if the pool is
-// being used. May only be called if the pool is being used.
-static void rand_state_put(struct rand_state *state) {
-  CRYPTO_STATIC_MUTEX_lock_write(rand_state_lock_bss_get());
-  state->next = *rand_state_free_list_bss_get();
-  *rand_state_free_list_bss_get() = state;
-  CRYPTO_STATIC_MUTEX_unlock_write(rand_state_lock_bss_get());
-}
-
 void RAND_bytes_with_additional_data(uint8_t *out, size_t out_len,
                                      const uint8_t user_additional_data[32]) {
   if (out_len == 0) {
     return;
   }
 
-  const int fork_unsafe_buffering = rand_fork_unsafe_buffering_enabled();
-
   // Additional data is mixed into every CTR-DRBG call to protect, as best we
   // can, against forks & VM clones. We do not over-read this information and
   // don't reseed with it so, from the point of view of FIPS, this doesn't
@@ -350,7 +248,7 @@
     // entropy is used. This can be expensive (one read per |RAND_bytes| call)
     // and so can be disabled by applications that we have ensured don't fork
     // and aren't at risk of VM cloning.
-    if (!fork_unsafe_buffering) {
+    if (!rand_fork_unsafe_buffering_enabled()) {
       CRYPTO_sysrand(additional_data, sizeof(additional_data));
     } else {
       OPENSSL_memset(additional_data, 0, sizeof(additional_data));
@@ -361,14 +259,41 @@
     additional_data[i] ^= user_additional_data[i];
   }
 
-  struct rand_state stack_state;
-  struct rand_state *state = rand_state_get(fork_unsafe_buffering);
+  struct rand_thread_state stack_state;
+  struct rand_thread_state *state =
+      CRYPTO_get_thread_local(OPENSSL_THREAD_LOCAL_RAND);
 
   if (state == NULL) {
-    // If the system is out of memory, use an ephemeral state on the
-    // stack.
-    state = &stack_state;
-    rand_state_init(state);
+    state = OPENSSL_malloc(sizeof(struct rand_thread_state));
+    if (state == NULL ||
+        !CRYPTO_set_thread_local(OPENSSL_THREAD_LOCAL_RAND, state,
+                                 rand_thread_state_free)) {
+      // If the system is out of memory, use an ephemeral state on the
+      // stack.
+      state = &stack_state;
+    }
+
+    state->last_block_valid = 0;
+    uint8_t seed[CTR_DRBG_ENTROPY_LEN];
+    rand_get_seed(state, seed);
+    if (!CTR_DRBG_init(&state->drbg, seed, NULL, 0)) {
+      abort();
+    }
+    state->calls = 0;
+
+#if defined(BORINGSSL_FIPS)
+    if (state != &stack_state) {
+      CRYPTO_STATIC_MUTEX_lock_write(thread_states_list_lock_bss_get());
+      struct rand_thread_state **states_list = thread_states_list_bss_get();
+      state->next = *states_list;
+      if (state->next != NULL) {
+        state->next->prev = state;
+      }
+      state->prev = NULL;
+      *states_list = state;
+      CRYPTO_STATIC_MUTEX_unlock_write(thread_states_list_lock_bss_get());
+    }
+#endif
   }
 
   if (state->calls >= kReseedInterval) {
@@ -377,13 +302,13 @@
 #if defined(BORINGSSL_FIPS)
     // Take a read lock around accesses to |state->drbg|. This is needed to
     // avoid returning bad entropy if we race with
-    // |rand_state_clear_all|.
+    // |rand_thread_state_clear_all|.
     //
     // This lock must be taken after any calls to |CRYPTO_sysrand| to avoid a
     // bug on ppc64le. glibc may implement pthread locks by wrapping user code
     // in a hardware transaction, but, on some older versions of glibc and the
     // kernel, syscalls made with |syscall| did not abort the transaction.
-    CRYPTO_STATIC_MUTEX_lock_read(rand_drbg_lock_bss_get());
+    CRYPTO_STATIC_MUTEX_lock_read(thread_states_list_lock_bss_get());
 #endif
     if (!CTR_DRBG_reseed(&state->drbg, seed, NULL, 0)) {
       abort();
@@ -391,7 +316,7 @@
     state->calls = 0;
   } else {
 #if defined(BORINGSSL_FIPS)
-    CRYPTO_STATIC_MUTEX_lock_read(rand_drbg_lock_bss_get());
+    CRYPTO_STATIC_MUTEX_lock_read(thread_states_list_lock_bss_get());
 #endif
   }
 
@@ -420,12 +345,8 @@
   }
 
 #if defined(BORINGSSL_FIPS)
-  CRYPTO_STATIC_MUTEX_unlock_read(rand_drbg_lock_bss_get());
+  CRYPTO_STATIC_MUTEX_unlock_read(thread_states_list_lock_bss_get());
 #endif
-
-  if (!fork_unsafe_buffering && state != &stack_state) {
-    rand_state_put(state);
-  }
 }
 
 int RAND_bytes(uint8_t *out, size_t out_len) {
diff --git a/crypto/rand_extra/forkunsafe.c b/crypto/rand_extra/forkunsafe.c
index 27921f0..0f1ecec 100644
--- a/crypto/rand_extra/forkunsafe.c
+++ b/crypto/rand_extra/forkunsafe.c
@@ -21,12 +21,9 @@
 
 // g_buffering_enabled is true if fork-unsafe buffering has been enabled.
 static int g_buffering_enabled = 0;
-static CRYPTO_once_t g_buffering_enabled_once = CRYPTO_ONCE_INIT;
-static int g_buffering_enabled_pending = 0;
 
-static void g_buffer_enabled_init(void) {
-  g_buffering_enabled = g_buffering_enabled_pending;
-}
+// g_lock protects |g_buffering_enabled|.
+static struct CRYPTO_STATIC_MUTEX g_lock = CRYPTO_STATIC_MUTEX_INIT;
 
 #if !defined(OPENSSL_WINDOWS)
 void RAND_enable_fork_unsafe_buffering(int fd) {
@@ -35,16 +32,15 @@
     abort();
   }
 
-  g_buffering_enabled_pending = 1;
-  CRYPTO_once(&g_buffering_enabled_once, g_buffer_enabled_init);
-  if (g_buffering_enabled != 1) {
-    // RAND_bytes has been called before this function.
-    abort();
-  }
+  CRYPTO_STATIC_MUTEX_lock_write(&g_lock);
+  g_buffering_enabled = 1;
+  CRYPTO_STATIC_MUTEX_unlock_write(&g_lock);
 }
 #endif
 
 int rand_fork_unsafe_buffering_enabled(void) {
-  CRYPTO_once(&g_buffering_enabled_once, g_buffer_enabled_init);
-  return g_buffering_enabled;
+  CRYPTO_STATIC_MUTEX_lock_read(&g_lock);
+  const int ret = g_buffering_enabled;
+  CRYPTO_STATIC_MUTEX_unlock_read(&g_lock);
+  return ret;
 }
diff --git a/include/openssl/rand.h b/include/openssl/rand.h
index c6527b3..5d02e12 100644
--- a/include/openssl/rand.h
+++ b/include/openssl/rand.h
@@ -57,10 +57,6 @@
 // ownership of |fd|. If |fd| is negative then /dev/urandom will be opened and
 // any error from open(2) crashes the address space.
 //
-// Setting this also enables thread-local PRNG state, which can reduce lock
-// contention in highly-threaded applications although at the cost of yet more
-// memory.
-//
 // It has an unusual name because the buffer is unsafe across calls to |fork|.
 // Hence, this function should never be called by libraries.
 OPENSSL_EXPORT void RAND_enable_fork_unsafe_buffering(int fd);