Only draw from RDRAND for additional_data if it's fast.

We seek to incorporate entropy into every |RAND_bytes| call to avoid
problems with fork() and VM cloning. However, on some chips, RDRAND is
significantly slower than a system call thus crushing the performance of
|RAND_bytes|.

This change disables use of RDRAND for this opportunistic draw for
non-Intel chips. BoringSSL will then fall back to either the OS, or
nothing (if fork-unsafe mode has been set).

RDRAND is still used for seeding the PRNG whenever it is available.

This now adds a new blocking case: RDRAND may be used for seeding, but
the syscall to get additional_data was still blocking. Previously that
didn't matter because, if a syscall was used to get additional_data,
then a blocking one had already been used to seed. Thus the syscall for
additional_data is now non-blocking.

Also, we had both |hwrand| and |rdrand| names hanging around. We don't
support entropy instructions other than RDRAND, so unify around |rdrand|
naming. If we ever do add support for more we can properly abstract at
that time.

Change-Id: I91121b270a2ebc667543dad1324f37285daad893
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/40565
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: David Benjamin <davidben@google.com>
diff --git a/crypto/cpu-intel.c b/crypto/cpu-intel.c
index cc41fc4..53ece95 100644
--- a/crypto/cpu-intel.c
+++ b/crypto/cpu-intel.c
@@ -123,7 +123,9 @@
 // and |out[1]|. See the comment in |OPENSSL_cpuid_setup| about this.
 static void handle_cpu_env(uint32_t *out, const char *in) {
   const int invert = in[0] == '~';
-  const int hex = in[invert] == '0' && in[invert+1] == 'x';
+  const int or = in[0] == '|';
+  const int skip_first_byte = invert || or;
+  const int hex = in[skip_first_byte] == '0' && in[skip_first_byte+1] == 'x';
 
   int sscanf_result;
   uint64_t v;
@@ -140,6 +142,9 @@
   if (invert) {
     out[0] &= ~v;
     out[1] &= ~(v >> 32);
+  } else if (or) {
+    out[0] |= v;
+    out[1] |= (v >> 32);
   } else {
     out[0] = v;
     out[1] = v >> 32;
@@ -264,10 +269,14 @@
 
   // OPENSSL_ia32cap can contain zero, one or two values, separated with a ':'.
   // Each value is a 64-bit, unsigned value which may start with "0x" to
-  // indicate a hex value. Prior to the 64-bit value, a '~' may be given.
+  // indicate a hex value. Prior to the 64-bit value, a '~' or '|' may be given.
   //
-  // If '~' isn't present, then the value is taken as the result of the CPUID.
-  // Otherwise the value is inverted and ANDed with the probed CPUID result.
+  // If the '~' prefix is present:
+  //   the value is inverted and ANDed with the probed CPUID result
+  // If the '|' prefix is present:
+  //   the value is ORed with the probed CPUID result
+  // Otherwise:
+  //   the value is taken as the result of the CPUID
   //
   // The first value determines OPENSSL_ia32cap_P[0] and [1]. The second [2]
   // and [3].
diff --git a/crypto/fipsmodule/rand/internal.h b/crypto/fipsmodule/rand/internal.h
index 280aae4..cf77bb3 100644
--- a/crypto/fipsmodule/rand/internal.h
+++ b/crypto/fipsmodule/rand/internal.h
@@ -47,10 +47,16 @@
 void CRYPTO_sysrand_for_seed(uint8_t *buf, size_t len);
 
 // CRYPTO_sysrand_if_available fills |len| bytes at |buf| with entropy from the
-// operating system, if the entropy pool is initialized. If it is uninitialized,
-// it will not block and will instead fill |buf| with all zeros or early
-// /dev/urandom output.
-void CRYPTO_sysrand_if_available(uint8_t *buf, size_t len);
+// operating system, or early /dev/urandom data, and returns 1, _if_ the entropy
+// pool is initialized or if getrandom() is not available and not in FIPS mode.
+// Otherwise it will not block and will instead fill |buf| with all zeros and
+// return 0.
+int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len);
+#else
+OPENSSL_INLINE int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len) {
+  CRYPTO_sysrand(buf, len);
+  return 1;
+}
 #endif
 
 // rand_fork_unsafe_buffering_enabled returns whether fork-unsafe buffering has
@@ -105,10 +111,19 @@
 
 
 #if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM)
+
 OPENSSL_INLINE int have_rdrand(void) {
   return (OPENSSL_ia32cap_get()[1] & (1u << 30)) != 0;
 }
 
+// have_fast_rdrand returns true if RDRAND is supported and it's reasonably
+// fast. Concretely the latter is defined by whether the chip is Intel (fast) or
+// not (assumed slow).
+OPENSSL_INLINE int have_fast_rdrand(void) {
+  const uint32_t *const ia32cap = OPENSSL_ia32cap_get();
+  return (ia32cap[1] & (1u << 30)) && (ia32cap[0] & (1u << 30));
+}
+
 // CRYPTO_rdrand writes eight bytes of random data from the hardware RNG to
 // |out|. It returns one on success or zero on hardware failure.
 int CRYPTO_rdrand(uint8_t out[8]);
@@ -117,6 +132,17 @@
 // the hardware RNG. The |len| argument must be a multiple of eight. It returns
 // one on success and zero on hardware failure.
 int CRYPTO_rdrand_multiple8_buf(uint8_t *buf, size_t len);
+
+#else  // OPENSSL_X86_64 && !OPENSSL_NO_ASM
+
+OPENSSL_INLINE int have_rdrand(void) {
+  return 0;
+}
+
+OPENSSL_INLINE int have_fast_rdrand(void) {
+  return 0;
+}
+
 #endif  // OPENSSL_X86_64 && !OPENSSL_NO_ASM
 
 
diff --git a/crypto/fipsmodule/rand/rand.c b/crypto/fipsmodule/rand/rand.c
index 87d7b30..7bed579 100644
--- a/crypto/fipsmodule/rand/rand.c
+++ b/crypto/fipsmodule/rand/rand.c
@@ -125,11 +125,9 @@
 
 #if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM) && \
     !defined(BORINGSSL_UNSAFE_DETERMINISTIC_MODE)
-static int hwrand(uint8_t *buf, const size_t len) {
-  if (!have_rdrand()) {
-    return 0;
-  }
-
+// rdrand should only be called if either |have_rdrand| or |have_fast_rdrand|
+// returned true.
+static int rdrand(uint8_t *buf, const size_t len) {
   const size_t len_multiple8 = len & ~7;
   if (!CRYPTO_rdrand_multiple8_buf(buf, len_multiple8)) {
     return 0;
@@ -157,7 +155,7 @@
 
 #else
 
-static int hwrand(uint8_t *buf, size_t len) {
+static int rdrand(uint8_t *buf, size_t len) {
   return 0;
 }
 
@@ -168,7 +166,8 @@
 static void rand_get_seed(struct rand_thread_state *state,
                           uint8_t seed[CTR_DRBG_ENTROPY_LEN]) {
   if (!state->last_block_valid) {
-    if (!hwrand(state->last_block, sizeof(state->last_block))) {
+    if (!have_rdrand() ||
+        !rdrand(state->last_block, sizeof(state->last_block))) {
       CRYPTO_sysrand_for_seed(state->last_block, sizeof(state->last_block));
     }
     state->last_block_valid = 1;
@@ -179,8 +178,8 @@
 #define FIPS_OVERREAD 10
   uint8_t entropy[CTR_DRBG_ENTROPY_LEN * FIPS_OVERREAD];
 
-  int used_hwrand = hwrand(entropy, sizeof(entropy));
-  if (!used_hwrand) {
+  int used_rdrand = have_rdrand() && rdrand(entropy, sizeof(entropy));
+  if (!used_rdrand) {
     CRYPTO_sysrand_for_seed(entropy, sizeof(entropy));
   }
 
@@ -215,7 +214,7 @@
 #if defined(OPENSSL_URANDOM)
   // If we used RDRAND, also opportunistically read from the system. This avoids
   // solely relying on the hardware once the entropy pool has been initialized.
-  if (used_hwrand) {
+  if (used_rdrand) {
     CRYPTO_sysrand_if_available(entropy, CTR_DRBG_ENTROPY_LEN);
     for (size_t i = 0; i < CTR_DRBG_ENTROPY_LEN; i++) {
       seed[i] ^= entropy[i];
@@ -246,15 +245,24 @@
   // don't reseed with it so, from the point of view of FIPS, this doesn't
   // provide “prediction resistance”. But, in practice, it does.
   uint8_t additional_data[32];
-  if (!hwrand(additional_data, sizeof(additional_data))) {
+  // Intel chips have fast RDRAND instructions while, in other cases, RDRAND can
+  // be _slower_ than a system call.
+  if (!have_fast_rdrand() ||
+      !rdrand(additional_data, sizeof(additional_data))) {
     // Without a hardware RNG to save us from address-space duplication, the OS
     // entropy is used. This can be expensive (one read per |RAND_bytes| call)
     // and so can be disabled by applications that we have ensured don't fork
     // and aren't at risk of VM cloning.
-    if (!rand_fork_unsafe_buffering_enabled()) {
-      CRYPTO_sysrand(additional_data, sizeof(additional_data));
-    } else {
+    if (rand_fork_unsafe_buffering_enabled()) {
       OPENSSL_memset(additional_data, 0, sizeof(additional_data));
+    } else if (!have_rdrand()) {
+      // No alternative so block for OS entropy.
+      CRYPTO_sysrand(additional_data, sizeof(additional_data));
+    } else if (!CRYPTO_sysrand_if_available(additional_data,
+                                            sizeof(additional_data)) &&
+               !rdrand(additional_data, sizeof(additional_data))) {
+      // RDRAND failed: block for OS entropy.
+      CRYPTO_sysrand(additional_data, sizeof(additional_data));
     }
   }
 
diff --git a/crypto/fipsmodule/rand/urandom.c b/crypto/fipsmodule/rand/urandom.c
index 6ca3400..c20340a 100644
--- a/crypto/fipsmodule/rand/urandom.c
+++ b/crypto/fipsmodule/rand/urandom.c
@@ -431,16 +431,18 @@
 #endif
 }
 
-void CRYPTO_sysrand_if_available(uint8_t *out, size_t requested) {
-  // Return all zeros if |fill_with_entropy| fails.
-  OPENSSL_memset(out, 0, requested);
+#endif  // BORINGSSL_FIPS
 
-  if (!fill_with_entropy(out, requested, /*block=*/0, /*seed=*/0) &&
-      errno != EAGAIN) {
+int CRYPTO_sysrand_if_available(uint8_t *out, size_t requested) {
+  if (fill_with_entropy(out, requested, /*block=*/0, /*seed=*/0)) {
+    return 1;
+  } else if (errno == EAGAIN) {
+    OPENSSL_memset(out, 0, requested);
+    return 0;
+  } else {
     perror("opportunistic entropy fill failed");
     abort();
   }
 }
-#endif  // BORINGSSL_FIPS
 
 #endif  // OPENSSL_URANDOM
diff --git a/crypto/fipsmodule/rand/urandom_test.cc b/crypto/fipsmodule/rand/urandom_test.cc
index 38b0202..266a603 100644
--- a/crypto/fipsmodule/rand/urandom_test.cc
+++ b/crypto/fipsmodule/rand/urandom_test.cc
@@ -32,10 +32,6 @@
 #define PTRACE_O_EXITKILL (1 << 20)
 #endif
 
-#if defined(OPENSSL_NO_ASM)
-static int have_rdrand() { return 0; }
-#endif
-
 // This test can be run with $OPENSSL_ia32cap=~0x4000000000000000 in order to
 // simulate the absence of RDRAND of machines that have it.
 
@@ -423,18 +419,49 @@
         !sysrand(true, kAdditionalDataLength)) {
       return ret;
     }
-  } else {
-    // Opportuntistic entropy draw in FIPS mode because RDRAND was used.
-    // In non-FIPS mode it's just drawn from |CRYPTO_sysrand| in a blocking
-    // way.
-    if (!sysrand(!is_fips, CTR_DRBG_ENTROPY_LEN)) {
-      return ret;
-    }
+  } else if (
+      // First additional data. If fast RDRAND isn't available then a
+      // non-blocking OS entropy draw will be tried.
+      (!have_fast_rdrand() && !sysrand(false, kAdditionalDataLength)) ||
+      // Opportuntistic entropy draw in FIPS mode because RDRAND was used.
+      // In non-FIPS mode it's just drawn from |CRYPTO_sysrand| in a blocking
+      // way.
+      !sysrand(!is_fips, CTR_DRBG_ENTROPY_LEN) ||
+      // Second entropy draw's additional data.
+      (!have_fast_rdrand() && !sysrand(false, kAdditionalDataLength))) {
+    return ret;
   }
 
   return ret;
 }
 
+static void CheckInvariants(const std::vector<Event> &events) {
+  // If RDRAND is available then there should be no blocking syscalls in FIPS
+  // mode.
+#if defined(BORINGSSL_FIPS)
+  if (have_rdrand()) {
+    for (const auto &event : events) {
+      switch (event.type) {
+        case Event::Syscall::kGetRandom:
+          if ((event.flags & GRND_NONBLOCK) == 0) {
+            ADD_FAILURE() << "Blocking getrandom found with RDRAND: "
+                          << ToString(events);
+          }
+          break;
+
+        case Event::Syscall::kUrandomIoctl:
+          ADD_FAILURE() << "Urandom polling found with RDRAND: "
+                        << ToString(events);
+          break;
+
+        default:
+          break;
+      }
+    }
+  }
+#endif
+}
+
 // Tests that |TestFunctionPRNGModel| is a correct model for the code in
 // urandom.c, at least to the limits of the the |Event| type.
 TEST(URandomTest, Test) {
@@ -453,6 +480,7 @@
     TRACE_FLAG(URANDOM_ERROR);
 
     const std::vector<Event> expected_trace = TestFunctionPRNGModel(flags);
+    CheckInvariants(expected_trace);
     std::vector<Event> actual_trace;
     GetTrace(&actual_trace, flags, TestFunction);
 
diff --git a/crypto/rand_extra/deterministic.c b/crypto/rand_extra/deterministic.c
index 34547ea..36774c8 100644
--- a/crypto/rand_extra/deterministic.c
+++ b/crypto/rand_extra/deterministic.c
@@ -49,8 +49,9 @@
   CRYPTO_sysrand(out, requested);
 }
 
-void CRYPTO_sysrand_if_available(uint8_t *out, size_t requested) {
+int CRYPTO_sysrand_if_available(uint8_t *out, size_t requested) {
   CRYPTO_sysrand(out, requested);
+  return 1;
 }
 
 #endif  // BORINGSSL_UNSAFE_DETERMINISTIC_MODE
diff --git a/util/all_tests.json b/util/all_tests.json
index c4c2787..a251e5c 100644
--- a/util/all_tests.json
+++ b/util/all_tests.json
@@ -13,10 +13,21 @@
     "cmd": ["crypto/urandom_test"]
   },
   {
+    "comment": "Without RDRAND",
     "cmd": ["crypto/urandom_test"],
     "env": ["OPENSSL_ia32cap=~0x4000000000000000"]
   },
   {
+    "comment": "Potentially with RDRAND, but not Intel",
+    "cmd": ["crypto/urandom_test"],
+    "env": ["OPENSSL_ia32cap=~0x0000000040000000"]
+  },
+  {
+    "comment": "Potentially with RDRAND, and forced to Intel",
+    "cmd": ["crypto/urandom_test"],
+    "env": ["OPENSSL_ia32cap=|0x0000000040000000"]
+  },
+  {
     "cmd": ["crypto/crypto_test", "--fork_unsafe_buffering", "--gtest_filter=RandTest.*:-RandTest.Fork"]
   },
   {