Use Windows Interlocked* APIs for refcounts when C11 isn't available

Right now, MSVC has to fall back to refcount_lock.c, which uses a single
global lock for all refcount operations. Instead, use the Interlocked*
APIs to implement them.

The motivation is two-fold. First, this removes a performance cliff when
building for Windows on a non-Clang compiler. (Although I've not been
able to measure it in an end-to-end EVP benchmark, only a synthetic
refcount-only benchmark.)

More importantly, it gets us closer to assuming atomics support on all
non-NO_THREADS configurations. (The next CL will take care of that.)
That, in turn, will make it easier to add an atomics-like abstraction
to some of our hotter synchronization points. (Even in newer glibc, with
its better rwlock, read locks fundamentally need to write to memory, so
we have some cacheline contention on shared locks.)

Annoyingly, the Windows atomic_load replacement is not quite right. I've
used a "no-op" InterlockedCompareExchange(p, 0, 0), which, empirically,
still results in a write. But a write to the refcount cacheline is surely
better than taking a global exclusive lock. See the comments in
refcount_win.c for details. OpenSSL uses InterlockedOr(p, 0), but that
actually generates even worse code: InterlockedOr needs a retry loop when
the underlying cmpxchg fails, whereas InterlockedCompareExchange is a
single cmpxchg.
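
For illustration, a minimal sketch of the two load strategies side by side
(the helper names below are hypothetical; the real code is atomic_load_u32
in refcount_win.c in this change):

  // Load via a "no-op" compare-exchange: a single lock cmpxchg. If *ptr is
  // 0, it writes 0 back; otherwise the exchange fails and the current value
  // is returned. Either way, the caller gets the old value.
  static uint32_t load_via_cmpxchg(volatile LONG *ptr) {
    return (uint32_t)InterlockedCompareExchange(ptr, 0, 0);
  }

  // OpenSSL's approach. InterlockedOr also returns the old value, but MSVC
  // lowers it to a cmpxchg retry loop, so it generates worse code.
  static uint32_t load_via_or(volatile LONG *ptr) {
    return (uint32_t)InterlockedOr(ptr, 0);
  }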

Hopefully, in the future (perhaps when we require VS 2022's successor,
based on [1]), this can be removed in favor of C11 atomics everywhere.

[1] https://devblogs.microsoft.com/cppblog/c11-atomics-in-visual-studio-2022-version-17-5-preview-2/
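
For reference, a rough sketch of the C11 shape this would eventually take
(illustrative only; the names are made up and this is not the exact contents
of refcount_c11.c):

  #include <stdatomic.h>

  static void refcount_inc_c11(_Atomic uint32_t *count) {
    // atomic_load is a genuine load, with no write to the cacheline.
    uint32_t expected = atomic_load(count);
    // Saturate at CRYPTO_REFCOUNT_MAX rather than overflowing.
    while (expected != CRYPTO_REFCOUNT_MAX &&
           !atomic_compare_exchange_weak(count, &expected, expected + 1)) {
    }
  }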

Bug: 570
Cq-Include-Trybots: luci.boringssl.try:linux_clang_rel_tsan
Change-Id: I125da139e2fd3ae51e54309309fda16ba97ccf20
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/59846
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt
index bc30702..12d15a8 100644
--- a/crypto/CMakeLists.txt
+++ b/crypto/CMakeLists.txt
@@ -204,6 +204,7 @@
   rc4/rc4.c
   refcount_c11.c
   refcount_lock.c
+  refcount_win.c
   rsa_extra/rsa_asn1.c
   rsa_extra/rsa_crypt.c
   rsa_extra/rsa_print.c
diff --git a/crypto/internal.h b/crypto/internal.h
index a4cd929..adcd444 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -548,6 +548,14 @@
 #define OPENSSL_C11_ATOMIC
 #endif
 
+// Older MSVC does not support C11 atomics, so we fall back to the Windows APIs.
+// This can be removed once we can rely on
+// https://devblogs.microsoft.com/cppblog/c11-atomics-in-visual-studio-2022-version-17-5-preview-2/
+#if !defined(OPENSSL_C11_ATOMIC) && defined(OPENSSL_THREADS) && \
+    defined(OPENSSL_WINDOWS)
+#define OPENSSL_WINDOWS_ATOMIC
+#endif
+
 // CRYPTO_REFCOUNT_MAX is the value at which the reference count saturates.
 #define CRYPTO_REFCOUNT_MAX 0xffffffff
 
diff --git a/crypto/refcount_lock.c b/crypto/refcount_lock.c
index 173267e..7886bf8 100644
--- a/crypto/refcount_lock.c
+++ b/crypto/refcount_lock.c
@@ -18,7 +18,7 @@
 #include <stdlib.h>
 
 
-#if !defined(OPENSSL_C11_ATOMIC)
+#if !defined(OPENSSL_C11_ATOMIC) && !defined(OPENSSL_WINDOWS_ATOMIC)
 
 static_assert((CRYPTO_refcount_t)-1 == CRYPTO_REFCOUNT_MAX,
               "CRYPTO_REFCOUNT_MAX is incorrect");
@@ -49,4 +49,4 @@
   return ret;
 }
 
-#endif  // OPENSSL_C11_ATOMIC
+#endif  // !OPENSSL_C11_ATOMIC && !OPENSSL_WINDOWS_ATOMIC
diff --git a/crypto/refcount_win.c b/crypto/refcount_win.c
new file mode 100644
index 0000000..7a2740b
--- /dev/null
+++ b/crypto/refcount_win.c
@@ -0,0 +1,89 @@
+/* Copyright (c) 2023, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include "internal.h"
+
+#if defined(OPENSSL_WINDOWS_ATOMIC)
+
+#include <windows.h>
+
+
+// See comment above the typedef of CRYPTO_refcount_t about these tests.
+static_assert(alignof(CRYPTO_refcount_t) == alignof(LONG),
+              "CRYPTO_refcount_t does not match LONG alignment");
+static_assert(sizeof(CRYPTO_refcount_t) == sizeof(LONG),
+              "CRYPTO_refcount_t does not match LONG size");
+
+static_assert((CRYPTO_refcount_t)-1 == CRYPTO_REFCOUNT_MAX,
+              "CRYPTO_REFCOUNT_MAX is incorrect");
+
+static uint32_t atomic_load_u32(volatile LONG *ptr) {
+  // This is not ideal because it still writes to a cacheline. MSVC is not able
+  // to optimize this to a true atomic read, and Windows does not provide an
+  // InterlockedLoad function.
+  //
+  // The Windows documentation [1] does say "Simple reads and writes to
+  // properly-aligned 32-bit variables are atomic operations", but this is not
+  // phrased in terms of the C11 and C++11 memory models, and indeed a read or
+  // write seems to produce slightly different code on MSVC than a sequentially
+  // consistent std::atomic::load in C++. Moreover, it is unclear if non-MSVC
+  // compilers on Windows provide the same guarantees. Thus we avoid relying on
+  // this and instead still use an interlocked function. This is still
+  // preferable to a global mutex, and eventually this code will be replaced
+  // by [2]. Additionally, on clang-cl, we'll use the |OPENSSL_C11_ATOMIC| path.
+  //
+  // [1] https://learn.microsoft.com/en-us/windows/win32/sync/interlocked-variable-access
+  // [2] https://devblogs.microsoft.com/cppblog/c11-atomics-in-visual-studio-2022-version-17-5-preview-2/
+  return (uint32_t)InterlockedCompareExchange(ptr, 0, 0);
+}
+
+static int atomic_compare_exchange_u32(volatile LONG *ptr, uint32_t *expected32,
+                                       uint32_t desired) {
+  LONG expected = (LONG)*expected32;
+  LONG actual = InterlockedCompareExchange(ptr, (LONG)desired, expected);
+  *expected32 = (uint32_t)actual;
+  return actual == expected;
+}
+
+void CRYPTO_refcount_inc(CRYPTO_refcount_t *in_count) {
+  volatile LONG *count = (volatile LONG *)in_count;
+  uint32_t expected = atomic_load_u32(count);
+
+  while (expected != CRYPTO_REFCOUNT_MAX) {
+    const uint32_t new_value = expected + 1;
+    if (atomic_compare_exchange_u32(count, &expected, new_value)) {
+      break;
+    }
+  }
+}
+
+int CRYPTO_refcount_dec_and_test_zero(CRYPTO_refcount_t *in_count) {
+  volatile LONG *count = (volatile LONG *)in_count;
+  uint32_t expected = atomic_load_u32(count);
+
+  for (;;) {
+    if (expected == 0) {
+      abort();
+    } else if (expected == CRYPTO_REFCOUNT_MAX) {
+      return 0;
+    } else {
+      const uint32_t new_value = expected - 1;
+      if (atomic_compare_exchange_u32(count, &expected, new_value)) {
+        return new_value == 0;
+      }
+    }
+  }
+}
+
+#endif  // OPENSSL_WINDOWS_ATOMIC
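
For context, a minimal usage sketch of the refcount API these files implement
(the RC_OBJECT type and functions below are hypothetical, not part of this
change):

  typedef struct rc_object_st {
    CRYPTO_refcount_t references;  // starts at one when the object is created
    // ... object state ...
  } RC_OBJECT;

  void RC_OBJECT_up_ref(RC_OBJECT *obj) {
    // Saturates at CRYPTO_REFCOUNT_MAX rather than overflowing.
    CRYPTO_refcount_inc(&obj->references);
  }

  void RC_OBJECT_free(RC_OBJECT *obj) {
    if (obj == NULL ||
        !CRYPTO_refcount_dec_and_test_zero(&obj->references)) {
      // Other references remain, or the count saturated and the object is
      // deliberately leaked.
      return;
    }
    OPENSSL_free(obj);
  }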