Support detecting preference for ymm registers over zmm

Add a CPU capability bit that identifies older Intel CPUs that support
AVX512 but where using zmm registers should be avoided.  This will be
used to select code that uses ymm registers instead.

Change-Id: I6bedc913960d0da3c5f3aae315c81f67da1667b4
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/68909
Reviewed-by: David Benjamin <davidben@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
Commit-Queue: Bob Beck <bbe@google.com>
diff --git a/crypto/cpu_intel.c b/crypto/cpu_intel.c
index 10e7871..7193493 100644
--- a/crypto/cpu_intel.c
+++ b/crypto/cpu_intel.c
@@ -173,20 +173,21 @@
 
   OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 1);
 
+  const uint32_t base_family = (eax >> 8) & 15;
+  const uint32_t base_model = (eax >> 4) & 15;
+
+  uint32_t family = base_family;
+  uint32_t model = base_model;
+  if (base_family == 15) {
+    const uint32_t ext_family = (eax >> 20) & 255;
+    family += ext_family;
+  }
+  if (base_family == 6 || base_family == 15) {
+    const uint32_t ext_model = (eax >> 16) & 15;
+    model |= ext_model << 4;
+  }
+
   if (is_amd) {
-    // See https://www.amd.com/system/files/TechDocs/25481.pdf, page 10.
-    const uint32_t base_family = (eax >> 8) & 15;
-    const uint32_t base_model = (eax >> 4) & 15;
-
-    uint32_t family = base_family;
-    uint32_t model = base_model;
-    if (base_family == 0xf) {
-      const uint32_t ext_family = (eax >> 20) & 255;
-      family += ext_family;
-      const uint32_t ext_model = (eax >> 16) & 15;
-      model |= ext_model << 4;
-    }
-
     if (family < 0x17 || (family == 0x17 && 0x70 <= model && model <= 0x7f)) {
       // Disable RDRAND on AMD families before 0x17 (Zen) due to reported
       // failures after suspend.
@@ -264,6 +265,32 @@
     extended_features[1] &= ~(1u << 14);  // AVX512VPOPCNTDQ
   }
 
+  // Repurpose the bit for the removed MPX feature to indicate when using zmm
+  // registers should be avoided even when they are supported. (When set, AVX512
+  // features can still be used, but only using ymm or xmm registers.) Skylake
+  // suffered from severe downclocking when zmm registers were used, which
+  // affected unrelated code running on the system, making zmm registers not too
+  // useful outside of benchmarks. The situation improved significantly by Ice
+  // Lake, but a small amount of downclocking remained. (See
+  // https://lore.kernel.org/linux-crypto/e8ce1146-3952-6977-1d0e-a22758e58914@intel.com/)
+  // We take a conservative approach of not allowing zmm registers until after
+  // Ice Lake and Tiger Lake, i.e. until Sapphire Rapids on the server side.
+  //
+  // AMD CPUs, which support AVX512 starting with Zen 4, have not been reported
+  // to have any downclocking problem when zmm registers are used.
+  if (is_intel && family == 6 &&
+      (model == 85 ||    // Skylake, Cascade Lake, Cooper Lake (server)
+       model == 106 ||   // Ice Lake (server)
+       model == 108 ||   // Ice Lake (micro server)
+       model == 125 ||   // Ice Lake (client)
+       model == 126 ||   // Ice Lake (mobile)
+       model == 140 ||   // Tiger Lake (mobile)
+       model == 141)) {  // Tiger Lake (client)
+    extended_features[0] |= 1u << 14;
+  } else {
+    extended_features[0] &= ~(1u << 14);
+  }
+
   OPENSSL_ia32cap_P[0] = edx;
   OPENSSL_ia32cap_P[1] = ecx;
   OPENSSL_ia32cap_P[2] = extended_features[0];
diff --git a/crypto/internal.h b/crypto/internal.h
index 4ec12f5..5ca29ae 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -1391,6 +1391,8 @@
 //     Bit 11 is used to indicate AMD XOP support, not SDBG
 //   Index 2:
 //     EBX for CPUID where EAX = 7, ECX = 0
+//     Bit 14 (for removed feature MPX) is used to indicate a preference for ymm
+//       registers over zmm even when zmm registers are supported
 //   Index 3:
 //     ECX for CPUID where EAX = 7, ECX = 0
 //
@@ -1580,6 +1582,14 @@
 #endif
 }
 
+// CRYPTO_cpu_avoid_zmm_registers returns 1 if zmm registers (512-bit vectors)
+// should not be used even if the CPU supports them, and 0 otherwise.
+//
+// It tests bit 14 of capability word 2, the bit of the removed MPX feature.
+OPENSSL_INLINE int CRYPTO_cpu_avoid_zmm_registers(void) {
+  return (OPENSSL_get_ia32cap(2) & (1u << 14)) != 0;
+}
+
 OPENSSL_INLINE int CRYPTO_is_VAES_capable(void) {
 #if defined(__VAES__)
   return 1;