Help the compiler dedup ia32cap and armcap accesses

https://boringssl-review.googlesource.com/c/boringssl/+/62585 made the
compiler emit multiple CRYPTO_library_init calls in functions which
dispatch between a tower of alternatives. Ideally, the compiler would
know that at most one call suffices.

There doesn't seem to be such an attribute, but we can get the same
effect with pure or const attributes. We tie init with returning the
capability vector. On Intel, because the vector is so large, we have to
go with a weaker version: the getter returns a single word selected by
index, so only calls with matching indices can be merged. Somewhat
annoyingly, the getter must be out-of-line, because otherwise the
compiler inlines first and loses the attribute.

I went with pure because we allow our unit tests to mutate
OPENSSL_armcap_P, which means the Arm one is, strictly speaking, pure,
not const. This slightly reduces optimization potential, but should
still allow deduping in most places. Confirmed that aes_init_key
now only calls a helper function once.

See discussion in
https://boringssl-review.googlesource.com/c/boringssl/+/62585/comment/26083b88_b3db2b75/

Bug: 35
Change-Id: I9bc464f0e5a0ed9601017a5037028f906693a137
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/62985
Auto-Submit: David Benjamin <davidben@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
Commit-Queue: Bob Beck <bbe@google.com>
diff --git a/crypto/crypto.c b/crypto/crypto.c
index beaae0f..0bca1e2 100644
--- a/crypto/crypto.c
+++ b/crypto/crypto.c
@@ -78,6 +78,11 @@
 // This value must be explicitly initialized to zero. See similar comment above.
 HIDDEN uint32_t OPENSSL_ia32cap_P[4] = {0};
 
+uint32_t OPENSSL_get_ia32cap(int idx) {
+  CRYPTO_library_init();  // Initialize the library before reading the vector.
+  return OPENSSL_ia32cap_P[idx];  // |idx| is not range-checked.
+}
+
 #elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
 
 #include <openssl/arm_arch.h>
@@ -116,10 +121,16 @@
 HIDDEN uint32_t OPENSSL_armcap_P = 0;
 
 uint32_t *OPENSSL_get_armcap_pointer_for_test(void) {
+  CRYPTO_library_init();
   return &OPENSSL_armcap_P;
 }
 #endif
 
+uint32_t OPENSSL_get_armcap(void) {
+  CRYPTO_library_init();  // Initialize the library before reading the value.
+  return OPENSSL_armcap_P;  // Returned by value; callers get a snapshot.
+}
+
 #endif
 
 #if defined(BORINGSSL_FIPS)
diff --git a/crypto/internal.h b/crypto/internal.h
index 15561fa..a2c49d6 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -255,6 +255,12 @@
 #define OPENSSL_SSE2
 #endif
 
+#if defined(__GNUC__) || defined(__clang__)
+#define OPENSSL_ATTR_PURE __attribute__((pure))
+#else
+#define OPENSSL_ATTR_PURE
+#endif
+
 #if defined(BORINGSSL_MALLOC_FAILURE_TESTING)
 // OPENSSL_reset_malloc_counter_for_testing, when malloc testing is enabled,
 // resets the internal malloc counter, to simulate further malloc failures. This
@@ -1228,23 +1234,14 @@
 //
 // Note: the CPUID bits are pre-adjusted for the OSXSAVE bit and the YMM and XMM
 // bits in XCR0, so it is not necessary to check those.
+//
+// From C, this symbol should only be accessed with |OPENSSL_get_ia32cap|.
 extern uint32_t OPENSSL_ia32cap_P[4];
 
-#if defined(BORINGSSL_FIPS) && !defined(BORINGSSL_SHARED_LIBRARY)
-// The FIPS module, as a static library, requires an out-of-line version of
-// |OPENSSL_ia32cap_get| so accesses can be rewritten by delocate. Mark the
-// function const so multiple accesses can be optimized together.
-const uint32_t *OPENSSL_ia32cap_get(void) __attribute__((const));
-#else
-OPENSSL_INLINE const uint32_t *OPENSSL_ia32cap_get(void) {
-  return OPENSSL_ia32cap_P;
-}
-#endif
-
-OPENSSL_INLINE int OPENSSL_ia32cap_has_bit(int idx, int bit) {
-  CRYPTO_library_init();
-  return (OPENSSL_ia32cap_get()[idx] & (1u << bit)) != 0;
-}
+// OPENSSL_get_ia32cap initializes the library if needed and returns the |idx|th
+// entry of |OPENSSL_ia32cap_P|. It is marked as a pure function so duplicate
+// calls can be merged by the compiler, at least when indices match.
+OPENSSL_ATTR_PURE uint32_t OPENSSL_get_ia32cap(int idx);
 
 // See Intel manual, volume 2A, table 3-11.
 
@@ -1252,13 +1249,13 @@
 #if defined(__FXSR__)
   return 1;
 #else
-  return OPENSSL_ia32cap_has_bit(/*idx=*/0, /*bit=*/24);
+  return (OPENSSL_get_ia32cap(0) & (1u << 24)) != 0;
 #endif
 }
 
 OPENSSL_INLINE int CRYPTO_is_intel_cpu(void) {
   // The reserved bit 30 is used to indicate an Intel CPU.
-  return OPENSSL_ia32cap_has_bit(/*idx=*/0, /*bit=*/30);
+  return (OPENSSL_get_ia32cap(0) & (1u << 30)) != 0;
 }
 
 // See Intel manual, volume 2A, table 3-10.
@@ -1267,7 +1264,7 @@
 #if defined(__PCLMUL__)
   return 1;
 #else
-  return OPENSSL_ia32cap_has_bit(/*idx=*/1, /*bit=*/1);
+  return (OPENSSL_get_ia32cap(1) & (1u << 1)) != 0;
 #endif
 }
 
@@ -1275,7 +1272,7 @@
 #if defined(__SSSE3__)
   return 1;
 #else
-  return OPENSSL_ia32cap_has_bit(/*idx=*/1, /*bit=*/9);
+  return (OPENSSL_get_ia32cap(1) & (1u << 9)) != 0;
 #endif
 }
 
@@ -1283,7 +1280,7 @@
 #if defined(__SSE4_1__)
   return 1;
 #else
-  return OPENSSL_ia32cap_has_bit(/*idx=*/1, /*bit=*/19);
+  return (OPENSSL_get_ia32cap(1) & (1u << 19)) != 0;
 #endif
 }
 
@@ -1291,7 +1288,7 @@
 #if defined(__MOVBE__)
   return 1;
 #else
-  return OPENSSL_ia32cap_has_bit(/*idx=*/1, /*bit=*/22);
+  return (OPENSSL_get_ia32cap(1) & (1u << 22)) != 0;
 #endif
 }
 
@@ -1299,7 +1296,7 @@
 #if defined(__AES__)
   return 1;
 #else
-  return OPENSSL_ia32cap_has_bit(/*idx=*/1, /*bit=*/25);
+  return (OPENSSL_get_ia32cap(1) & (1u << 25)) != 0;
 #endif
 }
 
@@ -1307,7 +1304,7 @@
 #if defined(__AVX__)
   return 1;
 #else
-  return OPENSSL_ia32cap_has_bit(/*idx=*/1, /*bit=*/28);
+  return (OPENSSL_get_ia32cap(1) & (1u << 28)) != 0;
 #endif
 }
 
@@ -1317,7 +1314,7 @@
 #if defined(__RDRND__)
   return 1;
 #else
-  return OPENSSL_ia32cap_has_bit(/*idx=*/1, /*bit=*/30);
+  return (OPENSSL_get_ia32cap(1) & (1u << 30)) != 0;
 #endif
 }
 
@@ -1327,7 +1324,7 @@
 #if defined(__BMI1__)
   return 1;
 #else
-  return OPENSSL_ia32cap_has_bit(/*idx=*/2, /*bit=*/3);
+  return (OPENSSL_get_ia32cap(2) & (1u << 3)) != 0;
 #endif
 }
 
@@ -1335,7 +1332,7 @@
 #if defined(__AVX2__)
   return 1;
 #else
-  return OPENSSL_ia32cap_has_bit(/*idx=*/2, /*bit=*/5);
+  return (OPENSSL_get_ia32cap(2) & (1u << 5)) != 0;
 #endif
 }
 
@@ -1343,7 +1340,7 @@
 #if defined(__BMI2__)
   return 1;
 #else
-  return OPENSSL_ia32cap_has_bit(/*idx=*/2, /*bit=*/8);
+  return (OPENSSL_get_ia32cap(2) & (1u << 8)) != 0;
 #endif
 }
 
@@ -1351,7 +1348,7 @@
 #if defined(__ADX__)
   return 1;
 #else
-  return OPENSSL_ia32cap_has_bit(/*idx=*/2, /*bit=*/19);
+  return (OPENSSL_get_ia32cap(2) & (1u << 19)) != 0;
 #endif
 }
 
@@ -1359,12 +1356,14 @@
 
 #if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
 
+// OPENSSL_armcap_P contains ARM CPU capabilities. From C, this should only be
+// accessed with |OPENSSL_get_armcap|.
 extern uint32_t OPENSSL_armcap_P;
 
-OPENSSL_INLINE uint32_t OPENSSL_get_armcap(void) {
-  CRYPTO_library_init();
-  return OPENSSL_armcap_P;
-}
+// OPENSSL_get_armcap initializes the library if needed and returns ARM CPU
+// capabilities. It is marked as a pure function so duplicate calls can be
+// merged by the compiler.
+OPENSSL_ATTR_PURE uint32_t OPENSSL_get_armcap(void);
 
 // We do not detect any features at runtime on several 32-bit Arm platforms.
 // Apple platforms and OpenBSD require NEON and moved to 64-bit to pick up Armv8