Avoid locks in CRYPTO_free_ex_data

Every time we free a type with ex_data (RSA, EC_KEY, DSA, SSL_CTX, SSL,
SSL_SESSION, X509, X509_STORE), we allocate and take a read lock. The
allocation means, if we believe in malloc failures, it is possible to
leak memory on malloc failure. The read lock causes an unnecessary bit
of contention writing to the cache line.

Instead, since we never remove ex_data entries, just thread them in a
singly-linked list. This way we only need to synchronize when to stop
iterating. Add a counter to synchronize that. (Or we could make each
'next' pointers atomic, but this seemed more straightforward.)

(I suspect this doesn't matter much, but it was shorter and we were
already allocating the funcs structures anyway.)

Bug: 570
Change-Id: Ie7ba5cc44f2b71ebd79c8971e784912d53af7f5c
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/60025
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Adam Langley <agl@google.com>
Auto-Submit: David Benjamin <davidben@google.com>
diff --git a/crypto/ex_data.c b/crypto/ex_data.c
index 867ced3..d34769f 100644
--- a/crypto/ex_data.c
+++ b/crypto/ex_data.c
@@ -116,7 +116,6 @@
 #include <openssl/crypto.h>
 #include <openssl/err.h>
 #include <openssl/mem.h>
-#include <openssl/stack.h>
 #include <openssl/thread.h>
 
 #include "internal.h"
@@ -128,14 +127,14 @@
   long argl;   // Arbitary long
   void *argp;  // Arbitary void pointer
   CRYPTO_EX_free *free_func;
+  // next points to the next |CRYPTO_EX_DATA_FUNCS| or NULL if this is the last
+  // one. It may only be read if synchronized with a read from |num_funcs|.
+  CRYPTO_EX_DATA_FUNCS *next;
 };
 
 int CRYPTO_get_ex_new_index(CRYPTO_EX_DATA_CLASS *ex_data_class, int *out_index,
                             long argl, void *argp, CRYPTO_EX_free *free_func) {
-  CRYPTO_EX_DATA_FUNCS *funcs;
-  int ret = 0;
-
-  funcs = OPENSSL_malloc(sizeof(CRYPTO_EX_DATA_FUNCS));
+  CRYPTO_EX_DATA_FUNCS *funcs = OPENSSL_malloc(sizeof(CRYPTO_EX_DATA_FUNCS));
   if (funcs == NULL) {
     return 0;
   }
@@ -143,37 +142,32 @@
   funcs->argl = argl;
   funcs->argp = argp;
   funcs->free_func = free_func;
+  funcs->next = NULL;
 
   CRYPTO_STATIC_MUTEX_lock_write(&ex_data_class->lock);
 
-  if (ex_data_class->meth == NULL) {
-    ex_data_class->meth = sk_CRYPTO_EX_DATA_FUNCS_new_null();
-  }
-
-  if (ex_data_class->meth == NULL) {
-    goto err;
-  }
-
+  uint32_t num_funcs = CRYPTO_atomic_load_u32(&ex_data_class->num_funcs);
   // The index must fit in |int|.
-  if (sk_CRYPTO_EX_DATA_FUNCS_num(ex_data_class->meth) >
-          (size_t)(INT_MAX - ex_data_class->num_reserved)) {
+  if (num_funcs > (size_t)(INT_MAX - ex_data_class->num_reserved)) {
     OPENSSL_PUT_ERROR(CRYPTO, ERR_R_OVERFLOW);
-    goto err;
+    CRYPTO_STATIC_MUTEX_unlock_write(&ex_data_class->lock);
+    return 0;
   }
 
-  if (!sk_CRYPTO_EX_DATA_FUNCS_push(ex_data_class->meth, funcs)) {
-    goto err;
+  // Append |funcs| to the linked list.
+  if (ex_data_class->last == NULL) {
+    assert(num_funcs == 0);
+    ex_data_class->funcs = funcs;
+    ex_data_class->last = funcs;
+  } else {
+    ex_data_class->last->next = funcs;
+    ex_data_class->last = funcs;
   }
-  funcs = NULL;  // |sk_CRYPTO_EX_DATA_FUNCS_push| takes ownership.
 
-  *out_index = (int)sk_CRYPTO_EX_DATA_FUNCS_num(ex_data_class->meth) - 1 +
-               ex_data_class->num_reserved;
-  ret = 1;
-
-err:
+  CRYPTO_atomic_store_u32(&ex_data_class->num_funcs, num_funcs + 1);
   CRYPTO_STATIC_MUTEX_unlock_write(&ex_data_class->lock);
-  OPENSSL_free(funcs);
-  return ret;
+  *out_index = (int)num_funcs + ex_data_class->num_reserved;
+  return 1;
 }
 
 int CRYPTO_set_ex_data(CRYPTO_EX_DATA *ad, int index, void *val) {
@@ -209,33 +203,6 @@
   return sk_void_value(ad->sk, idx);
 }
 
-// get_func_pointers takes a copy of the CRYPTO_EX_DATA_FUNCS pointers, if any,
-// for the given class. If there are some pointers, it sets |*out| to point to
-// a fresh stack of them. Otherwise it sets |*out| to NULL. It returns one on
-// success or zero on error.
-static int get_func_pointers(STACK_OF(CRYPTO_EX_DATA_FUNCS) **out,
-                             CRYPTO_EX_DATA_CLASS *ex_data_class) {
-  size_t n;
-
-  *out = NULL;
-
-  // CRYPTO_EX_DATA_FUNCS structures are static once set, so we can take a
-  // shallow copy of the list under lock and then use the structures without
-  // the lock held.
-  CRYPTO_STATIC_MUTEX_lock_read(&ex_data_class->lock);
-  n = sk_CRYPTO_EX_DATA_FUNCS_num(ex_data_class->meth);
-  if (n > 0) {
-    *out = sk_CRYPTO_EX_DATA_FUNCS_dup(ex_data_class->meth);
-  }
-  CRYPTO_STATIC_MUTEX_unlock_read(&ex_data_class->lock);
-
-  if (n > 0 && *out == NULL) {
-    return 0;
-  }
-
-  return 1;
-}
-
 void CRYPTO_new_ex_data(CRYPTO_EX_DATA *ad) {
   ad->sk = NULL;
 }
@@ -247,26 +214,21 @@
     return;
   }
 
-  STACK_OF(CRYPTO_EX_DATA_FUNCS) *func_pointers;
-  if (!get_func_pointers(&func_pointers, ex_data_class)) {
-    // TODO(davidben): This leaks memory on malloc error.
-    return;
-  }
-
+  uint32_t num_funcs = CRYPTO_atomic_load_u32(&ex_data_class->num_funcs);
   // |CRYPTO_get_ex_new_index| will not allocate indices beyond |INT_MAX|.
-  assert(sk_CRYPTO_EX_DATA_FUNCS_num(func_pointers) <=
-         (size_t)(INT_MAX - ex_data_class->num_reserved));
-  for (int i = 0; i < (int)sk_CRYPTO_EX_DATA_FUNCS_num(func_pointers); i++) {
-    CRYPTO_EX_DATA_FUNCS *func_pointer =
-        sk_CRYPTO_EX_DATA_FUNCS_value(func_pointers, i);
-    if (func_pointer->free_func) {
-      void *ptr = CRYPTO_get_ex_data(ad, i + ex_data_class->num_reserved);
-      func_pointer->free_func(obj, ptr, ad, i + ex_data_class->num_reserved,
-                              func_pointer->argl, func_pointer->argp);
-    }
-  }
+  assert(num_funcs <= (size_t)(INT_MAX - ex_data_class->num_reserved));
 
-  sk_CRYPTO_EX_DATA_FUNCS_free(func_pointers);
+  // Defer dereferencing |ex_data_class->funcs| and |funcs->next|. It must come
+  // after the |num_funcs| comparison to be correctly synchronized.
+  CRYPTO_EX_DATA_FUNCS *const *funcs = &ex_data_class->funcs;
+  for (uint32_t i = 0; i < num_funcs; i++) {
+    if ((*funcs)->free_func != NULL) {
+      int index = (int)i + ex_data_class->num_reserved;
+      void *ptr = CRYPTO_get_ex_data(ad, index);
+      (*funcs)->free_func(obj, ptr, ad, index, (*funcs)->argl, (*funcs)->argp);
+    }
+    funcs = &(*funcs)->next;
+  }
 
   sk_void_free(ad->sk);
   ad->sk = NULL;
diff --git a/crypto/internal.h b/crypto/internal.h
index 4b7d82c..d15f753 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -850,22 +850,25 @@
 
 typedef struct crypto_ex_data_func_st CRYPTO_EX_DATA_FUNCS;
 
-DECLARE_STACK_OF(CRYPTO_EX_DATA_FUNCS)
-
 // CRYPTO_EX_DATA_CLASS tracks the ex_indices registered for a type which
 // supports ex_data. It should defined as a static global within the module
 // which defines that type.
 typedef struct {
   struct CRYPTO_STATIC_MUTEX lock;
-  STACK_OF(CRYPTO_EX_DATA_FUNCS) *meth;
+  // funcs is a linked list of |CRYPTO_EX_DATA_FUNCS| structures. It may be
+  // traversed without serialization only up to |num_funcs|. last points to the
+  // final entry of |funcs|, or NULL if empty.
+  CRYPTO_EX_DATA_FUNCS *funcs, *last;
+  // num_funcs is the number of entries in |funcs|.
+  CRYPTO_atomic_u32 num_funcs;
   // num_reserved is one if the ex_data index zero is reserved for legacy
   // |TYPE_get_app_data| functions.
   uint8_t num_reserved;
 } CRYPTO_EX_DATA_CLASS;
 
-#define CRYPTO_EX_DATA_CLASS_INIT {CRYPTO_STATIC_MUTEX_INIT, NULL, 0}
+#define CRYPTO_EX_DATA_CLASS_INIT {CRYPTO_STATIC_MUTEX_INIT, NULL, NULL, 0, 0}
 #define CRYPTO_EX_DATA_CLASS_INIT_WITH_APP_DATA \
-    {CRYPTO_STATIC_MUTEX_INIT, NULL, 1}
+    {CRYPTO_STATIC_MUTEX_INIT, NULL, NULL, 0, 1}
 
 // CRYPTO_get_ex_new_index allocates a new index for |ex_data_class| and writes
 // it to |*out_index|. Each class of object should provide a wrapper function