Use a pool of |rand_state| objects.

Previously we used thread-local state objects in rand.c. However, for
applications with large numbers of threads, this can lead to excessive
memory usage.

This change causes us to maintain a mutex-protected pool of state
objects where the size of the pool equals the maximum concurrency of
|RAND_bytes|. This might lead to state objects bouncing between CPUs
more often, but should help the memory usage problem.

Change-Id: Ie83763d3bc139e64ac17bf7e015ad082b2f8a81a
Reviewed-on: https://boringssl-review.googlesource.com/29565
Commit-Queue: Adam Langley <agl@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
Reviewed-by: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/delocate.h b/crypto/fipsmodule/delocate.h
index 065a21c..59effde 100644
--- a/crypto/fipsmodule/delocate.h
+++ b/crypto/fipsmodule/delocate.h
@@ -23,7 +23,7 @@
 #if defined(BORINGSSL_FIPS) && !defined(OPENSSL_ASAN) && !defined(OPENSSL_MSAN)
 #define DEFINE_BSS_GET(type, name)        \
   static type name __attribute__((used)); \
-  type *name##_bss_get(void);
+  type *name##_bss_get(void) __attribute__((const));
 // For FIPS builds we require that CRYPTO_ONCE_INIT be zero.
 #define DEFINE_STATIC_ONCE(name) DEFINE_BSS_GET(CRYPTO_once_t, name)
 // For FIPS builds we require that CRYPTO_STATIC_MUTEX_INIT be zero.
diff --git a/crypto/fipsmodule/rand/rand.c b/crypto/fipsmodule/rand/rand.c
index 3ec92e6..02e63bc 100644
--- a/crypto/fipsmodule/rand/rand.c
+++ b/crypto/fipsmodule/rand/rand.c
@@ -54,75 +54,6 @@
 // continuous random number generator test in FIPS 140-2, section 4.9.2.
 #define CRNGT_BLOCK_SIZE 16
 
-// rand_thread_state contains the per-thread state for the RNG.
-struct rand_thread_state {
-  CTR_DRBG_STATE drbg;
-  // calls is the number of generate calls made on |drbg| since it was last
-  // (re)seeded. This is bound by |kReseedInterval|.
-  unsigned calls;
-  // last_block_valid is non-zero iff |last_block| contains data from
-  // |CRYPTO_sysrand|.
-  int last_block_valid;
-
-#if defined(BORINGSSL_FIPS)
-  // last_block contains the previous block from |CRYPTO_sysrand|.
-  uint8_t last_block[CRNGT_BLOCK_SIZE];
-  // next and prev form a NULL-terminated, double-linked list of all states in
-  // a process.
-  struct rand_thread_state *next, *prev;
-#endif
-};
-
-#if defined(BORINGSSL_FIPS)
-// thread_states_list is the head of a linked-list of all |rand_thread_state|
-// objects in the process, one per thread. This is needed because FIPS requires
-// that they be zeroed on process exit, but thread-local destructors aren't
-// called when the whole process is exiting.
-DEFINE_BSS_GET(struct rand_thread_state *, thread_states_list);
-DEFINE_STATIC_MUTEX(thread_states_list_lock);
-
-static void rand_thread_state_clear_all(void) __attribute__((destructor));
-static void rand_thread_state_clear_all(void) {
-  CRYPTO_STATIC_MUTEX_lock_write(thread_states_list_lock_bss_get());
-  for (struct rand_thread_state *cur = *thread_states_list_bss_get();
-       cur != NULL; cur = cur->next) {
-    CTR_DRBG_clear(&cur->drbg);
-  }
-  // |thread_states_list_lock is deliberately left locked so that any threads
-  // that are still running will hang if they try to call |RAND_bytes|.
-}
-#endif
-
-// rand_thread_state_free frees a |rand_thread_state|. This is called when a
-// thread exits.
-static void rand_thread_state_free(void *state_in) {
-  struct rand_thread_state *state = state_in;
-
-  if (state_in == NULL) {
-    return;
-  }
-
-#if defined(BORINGSSL_FIPS)
-  CRYPTO_STATIC_MUTEX_lock_write(thread_states_list_lock_bss_get());
-
-  if (state->prev != NULL) {
-    state->prev->next = state->next;
-  } else {
-    *thread_states_list_bss_get() = state->next;
-  }
-
-  if (state->next != NULL) {
-    state->next->prev = state->prev;
-  }
-
-  CRYPTO_STATIC_MUTEX_unlock_write(thread_states_list_lock_bss_get());
-
-  CTR_DRBG_clear(&state->drbg);
-#endif
-
-  OPENSSL_free(state);
-}
-
 #if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM) && \
     !defined(BORINGSSL_UNSAFE_DETERMINISTIC_MODE)
 
@@ -172,9 +103,31 @@
 
 #endif
 
+// rand_state is a poolable RNG state: a CTR-DRBG plus its bookkeeping.
+struct rand_state {
+  CTR_DRBG_STATE drbg;
+  // next chains free objects into the NULL-terminated |rand_state_free_list|.
+  struct rand_state *next;
+  // calls counts the generate calls made on |drbg| since it was last
+  // (re)seeded. A reseed is forced once it reaches |kReseedInterval|.
+  unsigned calls;
+
+#if defined(BORINGSSL_FIPS)
+  // next_all chains every |rand_state| allocated by |rand_state_get|, whether
+  // free or in use, into the NULL-terminated |rand_state_all_list| so that
+  // |rand_state_clear_all| can clear their DRBGs.
+  struct rand_state *next_all;
+  // last_block contains the previous block from |CRYPTO_sysrand|.
+  uint8_t last_block[CRNGT_BLOCK_SIZE];
+  // last_block_valid is non-zero iff |last_block| contains data from
+  // |CRYPTO_sysrand|.
+  int last_block_valid;
+#endif
+};
+
 #if defined(BORINGSSL_FIPS)
 
-static void rand_get_seed(struct rand_thread_state *state,
+static void rand_get_seed(struct rand_state *state,
                           uint8_t seed[CTR_DRBG_ENTROPY_LEN]) {
   if (!state->last_block_valid) {
     if (!hwrand(state->last_block, sizeof(state->last_block))) {
@@ -223,7 +176,7 @@
 
 #else
 
-static void rand_get_seed(struct rand_thread_state *state,
+static void rand_get_seed(struct rand_state *state,
                           uint8_t seed[CTR_DRBG_ENTROPY_LEN]) {
   // If not in FIPS mode, we don't overread from the system entropy source and
   // we don't depend only on the hardware RDRAND.
@@ -232,6 +185,97 @@
 
 #endif
 
+// rand_state_free_list is the head of a list of currently free |rand_state|
+// objects. A thread that needs a |rand_state| pops the head of this list, or
+// allocates a new one if the list is empty. Once it's finished, it pushes the
+// state back onto the front of the list.
+//
+// Previously we used thread-local states, but for processes with large numbers
+// of threads this can result in excessive memory usage. Since we never free
+// |rand_state| objects, the number of objects in memory will eventually equal
+// the peak number of concurrent |RAND_bytes| calls.
+DEFINE_BSS_GET(struct rand_state *, rand_state_free_list);
+
+// rand_state_lock protects |rand_state_free_list| (and |rand_state_all_list|,
+// in FIPS mode).
+DEFINE_STATIC_MUTEX(rand_state_lock);
+
+#if defined(BORINGSSL_FIPS)
+// rand_state_all_list is the head of a linked list of every |rand_state| that
+// |rand_state_get| has allocated. This is needed because FIPS requires that
+// they be zeroed on process exit.
+DEFINE_BSS_GET(struct rand_state *, rand_state_all_list);
+
+// rand_drbg_lock is taken in write mode by |rand_state_clear_all|, and in read
+// mode by |RAND_bytes_with_additional_data| around use of a state's |drbg|.
+// This ensures that, if a thread races the destructor, we never return bogus
+// random data. At worst, the thread will deadlock.
+DEFINE_STATIC_MUTEX(rand_drbg_lock);
+
+static void rand_state_clear_all(void) __attribute__((destructor));
+static void rand_state_clear_all(void) {
+  CRYPTO_STATIC_MUTEX_lock_write(rand_drbg_lock_bss_get());
+  CRYPTO_STATIC_MUTEX_lock_write(rand_state_lock_bss_get());
+  for (struct rand_state *cur = *rand_state_all_list_bss_get();
+       cur != NULL; cur = cur->next_all) {
+    CTR_DRBG_clear(&cur->drbg);
+  }
+  // Both locks are deliberately left held so that any thread that is still
+  // running will hang, not proceed, if it tries to call |RAND_bytes|.
+}
+#endif
+
+// rand_state_init zeroes |state| and seeds its DRBG, aborting on failure.
+static void rand_state_init(struct rand_state *state) {
+  OPENSSL_memset(state, 0, sizeof(struct rand_state));
+  uint8_t seed[CTR_DRBG_ENTROPY_LEN];
+  rand_get_seed(state, seed);
+  if (!CTR_DRBG_init(&state->drbg, seed, NULL, 0)) {
+    abort();
+  }
+}
+
+// rand_state_get pops a |rand_state| from the head of |rand_state_free_list|
+// and returns it. If the list is empty, it allocates and seeds a fresh
+// |rand_state| instead, returning NULL if that allocation fails.
+static struct rand_state *rand_state_get(void) {
+  struct rand_state *state = NULL;
+  CRYPTO_STATIC_MUTEX_lock_write(rand_state_lock_bss_get());
+  state = *rand_state_free_list_bss_get();
+  if (state != NULL) {
+    *rand_state_free_list_bss_get() = state->next;
+  }
+  CRYPTO_STATIC_MUTEX_unlock_write(rand_state_lock_bss_get());
+
+  if (state != NULL) {
+    return state;
+  }
+
+  state = OPENSSL_malloc(sizeof(struct rand_state));
+  if (state == NULL) {
+    return NULL;
+  }
+
+  rand_state_init(state);
+
+#if defined(BORINGSSL_FIPS)
+  CRYPTO_STATIC_MUTEX_lock_write(rand_state_lock_bss_get());
+  state->next_all = *rand_state_all_list_bss_get();
+  *rand_state_all_list_bss_get() = state;
+  CRYPTO_STATIC_MUTEX_unlock_write(rand_state_lock_bss_get());
+#endif
+
+  return state;
+}
+
+// rand_state_put returns |state| to the front of |rand_state_free_list|.
+static void rand_state_put(struct rand_state *state) {
+  CRYPTO_STATIC_MUTEX_lock_write(rand_state_lock_bss_get());
+  state->next = *rand_state_free_list_bss_get();
+  *rand_state_free_list_bss_get() = state;
+  CRYPTO_STATIC_MUTEX_unlock_write(rand_state_lock_bss_get());
+}
+
 void RAND_bytes_with_additional_data(uint8_t *out, size_t out_len,
                                      const uint8_t user_additional_data[32]) {
   if (out_len == 0) {
@@ -259,41 +303,14 @@
     additional_data[i] ^= user_additional_data[i];
   }
 
-  struct rand_thread_state stack_state;
-  struct rand_thread_state *state =
-      CRYPTO_get_thread_local(OPENSSL_THREAD_LOCAL_RAND);
+  struct rand_state stack_state;
+  struct rand_state *state = rand_state_get();
 
   if (state == NULL) {
-    state = OPENSSL_malloc(sizeof(struct rand_thread_state));
-    if (state == NULL ||
-        !CRYPTO_set_thread_local(OPENSSL_THREAD_LOCAL_RAND, state,
-                                 rand_thread_state_free)) {
-      // If the system is out of memory, use an ephemeral state on the
-      // stack.
-      state = &stack_state;
-    }
-
-    state->last_block_valid = 0;
-    uint8_t seed[CTR_DRBG_ENTROPY_LEN];
-    rand_get_seed(state, seed);
-    if (!CTR_DRBG_init(&state->drbg, seed, NULL, 0)) {
-      abort();
-    }
-    state->calls = 0;
-
-#if defined(BORINGSSL_FIPS)
-    if (state != &stack_state) {
-      CRYPTO_STATIC_MUTEX_lock_write(thread_states_list_lock_bss_get());
-      struct rand_thread_state **states_list = thread_states_list_bss_get();
-      state->next = *states_list;
-      if (state->next != NULL) {
-        state->next->prev = state;
-      }
-      state->prev = NULL;
-      *states_list = state;
-      CRYPTO_STATIC_MUTEX_unlock_write(thread_states_list_lock_bss_get());
-    }
-#endif
+    // If the system is out of memory, use an ephemeral state on the
+    // stack.
+    state = &stack_state;
+    rand_state_init(state);
   }
 
   if (state->calls >= kReseedInterval) {
@@ -302,13 +319,13 @@
 #if defined(BORINGSSL_FIPS)
     // Take a read lock around accesses to |state->drbg|. This is needed to
     // avoid returning bad entropy if we race with
-    // |rand_thread_state_clear_all|.
+    // |rand_state_clear_all|.
     //
     // This lock must be taken after any calls to |CRYPTO_sysrand| to avoid a
     // bug on ppc64le. glibc may implement pthread locks by wrapping user code
     // in a hardware transaction, but, on some older versions of glibc and the
     // kernel, syscalls made with |syscall| did not abort the transaction.
-    CRYPTO_STATIC_MUTEX_lock_read(thread_states_list_lock_bss_get());
+    CRYPTO_STATIC_MUTEX_lock_read(rand_drbg_lock_bss_get());
 #endif
     if (!CTR_DRBG_reseed(&state->drbg, seed, NULL, 0)) {
       abort();
@@ -316,7 +333,7 @@
     state->calls = 0;
   } else {
 #if defined(BORINGSSL_FIPS)
-    CRYPTO_STATIC_MUTEX_lock_read(thread_states_list_lock_bss_get());
+    CRYPTO_STATIC_MUTEX_lock_read(rand_drbg_lock_bss_get());
 #endif
   }
 
@@ -343,8 +360,12 @@
   }
 
 #if defined(BORINGSSL_FIPS)
-  CRYPTO_STATIC_MUTEX_unlock_read(thread_states_list_lock_bss_get());
+  CRYPTO_STATIC_MUTEX_unlock_read(rand_drbg_lock_bss_get());
 #endif
+
+  if (state != &stack_state) {
+    rand_state_put(state);
+  }
 }
 
 int RAND_bytes(uint8_t *out, size_t out_len) {
diff --git a/crypto/internal.h b/crypto/internal.h
index 3dde476..c4e2e51 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -528,7 +528,6 @@
 // stored.
 typedef enum {
   OPENSSL_THREAD_LOCAL_ERR = 0,
-  OPENSSL_THREAD_LOCAL_RAND,
   OPENSSL_THREAD_LOCAL_TEST,
   NUM_OPENSSL_THREAD_LOCALS,
 } thread_local_data_t;