diff --git a/crypto/fipsmodule/rand/fork_detect.c b/crypto/fipsmodule/rand/fork_detect.c
index 58b0687..9e46223 100644
--- a/crypto/fipsmodule/rand/fork_detect.c
+++ b/crypto/fipsmodule/rand/fork_detect.c
@@ -38,7 +38,7 @@
 
 DEFINE_STATIC_ONCE(g_fork_detect_once);
 DEFINE_STATIC_MUTEX(g_fork_detect_lock);
-DEFINE_BSS_GET(volatile char *, g_fork_detect_addr);
+DEFINE_BSS_GET(CRYPTO_atomic_u32 *, g_fork_detect_addr);
 DEFINE_BSS_GET(uint64_t, g_fork_generation);
 DEFINE_BSS_GET(int, g_force_madv_wipeonfork);
 DEFINE_BSS_GET(int, g_force_madv_wipeonfork_enabled);
@@ -70,7 +70,7 @@
     return;
   }
 
-  *((volatile char *) addr) = 1;
+  CRYPTO_atomic_store_u32(addr, 1);
   *g_fork_detect_addr_bss_get() = addr;
   *g_fork_generation_bss_get() = 1;
 }
@@ -83,16 +83,12 @@
   // is initialised atomically, even if multiple threads enter this function
   // concurrently.
   //
-  // In the limit, the kernel may clear WIPEONFORK pages while a multi-threaded
-  // process is running. (For example, because a VM was cloned.) Therefore a
-  // lock is used below to synchronise the potentially multiple threads that may
-  // concurrently observe the cleared flag.
+  // Additionally, while the kernel will only clear WIPEONFORK at a point when a
+  // child process is single-threaded, the child may become multi-threaded
+  // before it observes this. Therefore, we must synchronize the logic below.
 
   CRYPTO_once(g_fork_detect_once_bss_get(), init_fork_detect);
-  // This pointer is |volatile| because the value pointed to may be changed by
-  // external forces (i.e. the kernel wiping the page) thus the compiler must
-  // not assume that it has exclusive access to it.
-  volatile char *const flag_ptr = *g_fork_detect_addr_bss_get();
+  CRYPTO_atomic_u32 *const flag_ptr = *g_fork_detect_addr_bss_get();
   if (flag_ptr == NULL) {
     // Our kernel is too old to support |MADV_WIPEONFORK| or
     // |g_force_madv_wipeonfork| is set.
@@ -105,28 +101,34 @@
     return 0;
   }
 
-  struct CRYPTO_STATIC_MUTEX *const lock = g_fork_detect_lock_bss_get();
+  // In the common case, try to observe the flag without taking a lock. This
+  // avoids cacheline contention in the PRNG.
   uint64_t *const generation_ptr = g_fork_generation_bss_get();
-
-  CRYPTO_STATIC_MUTEX_lock_read(lock);
-  uint64_t current_generation = *generation_ptr;
-  if (*flag_ptr) {
-    CRYPTO_STATIC_MUTEX_unlock_read(lock);
-    return current_generation;
+  if (CRYPTO_atomic_load_u32(flag_ptr) != 0) {
+    // If we observe a non-zero flag, it is safe to read |generation_ptr|
+    // without a lock. The flag and generation number are fixed for this copy of
+    // the address space.
+    return *generation_ptr;
   }
 
-  CRYPTO_STATIC_MUTEX_unlock_read(lock);
+  // The flag was zero. The generation number must be incremented, but other
+  // threads may have concurrently observed the zero, so take a lock before
+  // incrementing.
+  struct CRYPTO_STATIC_MUTEX *const lock = g_fork_detect_lock_bss_get();
   CRYPTO_STATIC_MUTEX_lock_write(lock);
-  current_generation = *generation_ptr;
-  if (*flag_ptr == 0) {
+  uint64_t current_generation = *generation_ptr;
+  if (CRYPTO_atomic_load_u32(flag_ptr) == 0) {
     // A fork has occurred.
-    *flag_ptr = 1;
-
     current_generation++;
     if (current_generation == 0) {
+      // Zero means fork detection isn't supported, so skip that value.
       current_generation = 1;
     }
+
+    // We must update |generation_ptr| before |flag_ptr|. Other threads may
+    // observe |flag_ptr| without taking a lock.
     *generation_ptr = current_generation;
+    CRYPTO_atomic_store_u32(flag_ptr, 1);
   }
   CRYPTO_STATIC_MUTEX_unlock_write(lock);
 
diff --git a/crypto/fipsmodule/rand/rand.c b/crypto/fipsmodule/rand/rand.c
index 0ead182..bf6b046 100644
--- a/crypto/fipsmodule/rand/rand.c
+++ b/crypto/fipsmodule/rand/rand.c
@@ -72,6 +72,10 @@
   // next and prev form a NULL-terminated, double-linked list of all states in
   // a process.
   struct rand_thread_state *next, *prev;
+  // clear_drbg_lock synchronizes between uses of |drbg| and
+  // |rand_thread_state_clear_all| clearing it. This lock should be uncontended
+  // in the common case, except on shutdown.
+  CRYPTO_MUTEX clear_drbg_lock;
 #endif
 };
 
@@ -82,18 +86,19 @@
 // called when the whole process is exiting.
 DEFINE_BSS_GET(struct rand_thread_state *, thread_states_list);
 DEFINE_STATIC_MUTEX(thread_states_list_lock);
-DEFINE_STATIC_MUTEX(state_clear_all_lock);
 
 static void rand_thread_state_clear_all(void) __attribute__((destructor));
 static void rand_thread_state_clear_all(void) {
   CRYPTO_STATIC_MUTEX_lock_write(thread_states_list_lock_bss_get());
-  CRYPTO_STATIC_MUTEX_lock_write(state_clear_all_lock_bss_get());
   for (struct rand_thread_state *cur = *thread_states_list_bss_get();
        cur != NULL; cur = cur->next) {
+    CRYPTO_MUTEX_lock_write(&cur->clear_drbg_lock);  // Never released.
     CTR_DRBG_clear(&cur->drbg);
   }
   // The locks are deliberately left locked so that any threads that are still
-  // running will hang if they try to call |RAND_bytes|.
+  // running will hang if they try to call |RAND_bytes|. Leaving them held also
+  // ensures |rand_thread_state_free| cannot free any thread state once we have
+  // taken the lock.
 }
 #endif
 
@@ -385,6 +390,7 @@
     state->fork_generation = fork_generation;
 
 #if defined(BORINGSSL_FIPS)
+    CRYPTO_MUTEX_init(&state->clear_drbg_lock);
     if (state != &stack_state) {
       CRYPTO_STATIC_MUTEX_lock_write(thread_states_list_lock_bss_get());
       struct rand_thread_state **states_list = thread_states_list_bss_get();
@@ -410,7 +416,7 @@
     // Take a read lock around accesses to |state->drbg|. This is needed to
     // avoid returning bad entropy if we race with
     // |rand_thread_state_clear_all|.
-    CRYPTO_STATIC_MUTEX_lock_read(state_clear_all_lock_bss_get());
+    CRYPTO_MUTEX_lock_read(&state->clear_drbg_lock);
 #endif
     if (!CTR_DRBG_reseed(&state->drbg, seed, reseed_additional_data,
                          reseed_additional_data_len)) {
@@ -420,7 +426,7 @@
     state->fork_generation = fork_generation;
   } else {
 #if defined(BORINGSSL_FIPS)
-    CRYPTO_STATIC_MUTEX_lock_read(state_clear_all_lock_bss_get());
+    CRYPTO_MUTEX_lock_read(&state->clear_drbg_lock);
 #endif
   }
 
@@ -449,7 +455,7 @@
   }
 
 #if defined(BORINGSSL_FIPS)
-  CRYPTO_STATIC_MUTEX_unlock_read(state_clear_all_lock_bss_get());
+  CRYPTO_MUTEX_unlock_read(&state->clear_drbg_lock);
 #endif
 }
 
diff --git a/crypto/internal.h b/crypto/internal.h
index 00f0582..9edfd0e 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -594,6 +594,11 @@
   return atomic_compare_exchange_weak(val, expected, desired);
 }
 
+OPENSSL_INLINE void CRYPTO_atomic_store_u32(CRYPTO_atomic_u32 *val,
+                                            uint32_t desired) {
+  atomic_store(val, desired);  // Sequentially consistent store.
+}
+
 #elif defined(OPENSSL_WINDOWS_ATOMIC)
 
 typedef LONG CRYPTO_atomic_u32;
@@ -626,6 +631,11 @@
   return actual == expected;
 }
 
+OPENSSL_INLINE void CRYPTO_atomic_store_u32(volatile CRYPTO_atomic_u32 *val,
+                                            uint32_t desired) {
+  InterlockedExchange(val, (LONG)desired);  // Previous value is discarded.
+}
+
 #elif !defined(OPENSSL_THREADS)
 
 typedef uint32_t CRYPTO_atomic_u32;
@@ -644,6 +654,11 @@
   return 1;
 }
 
+OPENSSL_INLINE void CRYPTO_atomic_store_u32(CRYPTO_atomic_u32 *val,
+                                            uint32_t desired) {
+  *val = desired;  // Built without threads, so a plain store suffices.
+}
+
 #else
 
 // Require some atomics implementation. Contact BoringSSL maintainers if you
diff --git a/crypto/rand_extra/forkunsafe.c b/crypto/rand_extra/forkunsafe.c
index 0f1ecec..356afdd 100644
--- a/crypto/rand_extra/forkunsafe.c
+++ b/crypto/rand_extra/forkunsafe.c
@@ -17,13 +17,12 @@
 #include <stdlib.h>
 
 #include "../fipsmodule/rand/internal.h"
+#include "../internal.h"
 
 
-// g_buffering_enabled is true if fork-unsafe buffering has been enabled.
-static int g_buffering_enabled = 0;
-
-// g_lock protects |g_buffering_enabled|.
-static struct CRYPTO_STATIC_MUTEX g_lock = CRYPTO_STATIC_MUTEX_INIT;
+// g_buffering_enabled is one if fork-unsafe buffering has been enabled and zero
+// otherwise.
+static CRYPTO_atomic_u32 g_buffering_enabled = 0;
 
 #if !defined(OPENSSL_WINDOWS)
 void RAND_enable_fork_unsafe_buffering(int fd) {
@@ -32,15 +31,10 @@
     abort();
   }
 
-  CRYPTO_STATIC_MUTEX_lock_write(&g_lock);
-  g_buffering_enabled = 1;
-  CRYPTO_STATIC_MUTEX_unlock_write(&g_lock);
+  CRYPTO_atomic_store_u32(&g_buffering_enabled, 1);
 }
 #endif
 
 int rand_fork_unsafe_buffering_enabled(void) {
-  CRYPTO_STATIC_MUTEX_lock_read(&g_lock);
-  const int ret = g_buffering_enabled;
-  CRYPTO_STATIC_MUTEX_unlock_read(&g_lock);
-  return ret;
+  return CRYPTO_atomic_load_u32(&g_buffering_enabled) != 0;  // No lock needed.
 }
