rand: new-style locking and support rdrand.

Pure /dev/urandom, no buffering (previous behaviour):
Did 2320000 RNG (16 bytes) operations in 3000082us (773312.2 ops/sec): 12.4 MB/s
Did 209000 RNG (256 bytes) operations in 3011984us (69389.5 ops/sec): 17.8 MB/s
Did 6851 RNG (8192 bytes) operations in 3052027us (2244.7 ops/sec): 18.4 MB/s

Pure rdrand speed:
Did 34930500 RNG (16 bytes) operations in 3000021us (11643418.5 ops/sec): 186.3 MB/s
Did 2444000 RNG (256 bytes) operations in 3000164us (814622.1 ops/sec): 208.5 MB/s
Did 80000 RNG (8192 bytes) operations in 3020968us (26481.6 ops/sec): 216.9 MB/s

rdrand + ChaCha (as in this change):
Did 19498000 RNG (16 bytes) operations in 3000086us (6499147.0 ops/sec): 104.0 MB/s
Did 1964000 RNG (256 bytes) operations in 3000566us (654543.2 ops/sec): 167.6 MB/s
Did 62000 RNG (8192 bytes) operations in 3034090us (20434.5 ops/sec): 167.4 MB/s

Change-Id: Ie17045650cfe75858e4498ac28dbc4dcf8338376
Reviewed-on: https://boringssl-review.googlesource.com/4328
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/internal.h b/crypto/internal.h
index 6a8d5b2..9c5d487 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -434,6 +434,7 @@
  * stored. */
 typedef enum {
   OPENSSL_THREAD_LOCAL_ERR = 0,
+  OPENSSL_THREAD_LOCAL_RAND,
   OPENSSL_THREAD_LOCAL_TEST,
   NUM_OPENSSL_THREAD_LOCALS,
 } thread_local_data_t;
diff --git a/crypto/rand/CMakeLists.txt b/crypto/rand/CMakeLists.txt
index 23c1b24..374d8f1 100644
--- a/crypto/rand/CMakeLists.txt
+++ b/crypto/rand/CMakeLists.txt
@@ -1,5 +1,13 @@
 include_directories(. .. ../../include)
 
+if (${ARCH} STREQUAL "x86_64")
+  set(
+    RAND_ARCH_SOURCES
+
+    rdrand-x86_64.${ASM_EXT}
+  )
+endif()
+
 add_library(
   rand
 
@@ -8,4 +16,9 @@
   rand.c
   urandom.c
   windows.c
+  hwrand.c
+
+  ${RAND_ARCH_SOURCES}
 )
+
+perlasm(rdrand-x86_64.${ASM_EXT} asm/rdrand-x86_64.pl)
diff --git a/crypto/rand/asm/rdrand-x86_64.pl b/crypto/rand/asm/rdrand-x86_64.pl
new file mode 100644
index 0000000..a917611
--- /dev/null
+++ b/crypto/rand/asm/rdrand-x86_64.pl
@@ -0,0 +1,34 @@
+#!/usr/bin/env perl
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+print<<___;
+.text
+
+# CRYPTO_rdrand writes eight bytes of random data from the hardware RNG to
+# |out|. It returns one on success or zero on hardware failure.
+# int CRYPTO_rdrand(uint8_t out[8]);
+.globl	CRYPTO_rdrand
+.type	CRYPTO_rdrand,\@function,1
+.align	16
+CRYPTO_rdrand:
+	xorq %rax, %rax
+	# This is rdrand %rcx. It sets rcx to a random value and sets the carry
+	# flag on success.
+	.byte 0x48, 0x0f, 0xc7, 0xf1
+	# An add-with-carry of zero effectively sets %rax to the carry flag.
+	adcq %rax, %rax
+	movq %rcx, 0(%rdi)
+	retq
+___
+
+close STDOUT;	# flush
diff --git a/crypto/rand/hwrand.c b/crypto/rand/hwrand.c
new file mode 100644
index 0000000..0d2833a
--- /dev/null
+++ b/crypto/rand/hwrand.c
@@ -0,0 +1,93 @@
+/* Copyright (c) 2015, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/rand.h>
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <openssl/cpu.h>
+
+#include "internal.h"
+
+
+#if defined(OPENSSL_X86_64)
+
+/* kRdrandMaxAttempts is how many times RDRAND is tried for a single 64-bit
+ * value before the hardware RNG is treated as unavailable. Ten attempts is the
+ * retry limit suggested by Intel's DRNG software implementation guide. */
+static const unsigned kRdrandMaxAttempts = 10;
+
+/* CRYPTO_have_hwrand returns one iff bit 30 of |OPENSSL_ia32cap_P[1]|, the
+ * CPUID feature bit for RDRAND, is set. */
+int CRYPTO_have_hwrand(void) {
+  return (OPENSSL_ia32cap_P[1] & (1u << 30)) != 0;
+}
+
+/* CRYPTO_rdrand is defined in asm/rdrand-x86_64.pl. It executes RDRAND once,
+ * stores the 64-bit result at |out| and returns one on success or zero if the
+ * instruction reported failure. */
+extern int CRYPTO_rdrand(uint8_t out[8]);
+
+/* rdrand_with_retry writes eight bytes of hardware entropy to |out|, making up
+ * to |kRdrandMaxAttempts| attempts. It returns one on success and zero if
+ * every attempt failed. */
+static int rdrand_with_retry(uint8_t out[8]) {
+  unsigned attempt;
+  for (attempt = 0; attempt < kRdrandMaxAttempts; attempt++) {
+    if (CRYPTO_rdrand(out)) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+/* CRYPTO_hwrand fills |len| bytes at |buf| from RDRAND. The signature cannot
+ * report failure, so if RDRAND keeps failing the bytes not yet written are
+ * taken from |CRYPTO_sysrand| rather than left unfilled. */
+void CRYPTO_hwrand(uint8_t *buf, size_t len) {
+  while (len >= 8) {
+    if (!rdrand_with_retry(buf)) {
+      CRYPTO_sysrand(buf, len);
+      return;
+    }
+    buf += 8;
+    len -= 8;
+  }
+
+  if (len > 0) {
+    /* RDRAND yields whole 64-bit words, so the tail goes via a scratch buffer
+     * and only the |len| bytes still needed are copied out. */
+    uint8_t tail[8];
+    if (!rdrand_with_retry(tail)) {
+      CRYPTO_sysrand(buf, len);
+      return;
+    }
+    memcpy(buf, tail, len);
+  }
+}
+
+#else
+
+/* Without RDRAND support there is no hardware RNG to offer. */
+int CRYPTO_have_hwrand(void) {
+  return 0;
+}
+
+/* CRYPTO_hwrand must not be called when |CRYPTO_have_hwrand| returns zero. */
+void CRYPTO_hwrand(uint8_t *buf, size_t len) {
+  abort();
+}
+
+#endif
diff --git a/crypto/rand/internal.h b/crypto/rand/internal.h
new file mode 100644
index 0000000..1cca7f3
--- /dev/null
+++ b/crypto/rand/internal.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2015, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#ifndef OPENSSL_HEADER_CRYPTO_RAND_INTERNAL_H
+#define OPENSSL_HEADER_CRYPTO_RAND_INTERNAL_H
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+
+/* CRYPTO_sysrand fills |len| bytes at |buf| with entropy from the operating
+ * system. */
+void CRYPTO_sysrand(uint8_t *buf, size_t len);
+
+/* CRYPTO_have_hwrand returns one iff |CRYPTO_hwrand| can be called to generate
+ * hardware entropy. */
+int CRYPTO_have_hwrand(void);
+
+/* CRYPTO_hwrand fills |len| bytes at |buf| with entropy from the hardware.
+ * This function can only be called if |CRYPTO_have_hwrand| returns one. */
+void CRYPTO_hwrand(uint8_t *buf, size_t len);
+
+
+#if defined(__cplusplus)
+}  /* extern C */
+#endif
+
+#endif  /* OPENSSL_HEADER_CRYPTO_RAND_INTERNAL_H */
diff --git a/crypto/rand/rand.c b/crypto/rand/rand.c
index 6780b6c..5f94a15 100644
--- a/crypto/rand/rand.c
+++ b/crypto/rand/rand.c
@@ -14,6 +14,134 @@
 
 #include <openssl/rand.h>
 
+#include <string.h>
+
+#include <openssl/mem.h>
+
+#include "internal.h"
+#include "../internal.h"
+
+
+/* It's assumed that the operating system always has an unfailing source of
+ * entropy which is accessed via |CRYPTO_sysrand|. (If the operating system
+ * entropy source fails, it's up to |CRYPTO_sysrand| to abort the process—we
+ * don't try to handle it.)
+ *
+ * In addition, the hardware may provide a low-latency RNG. Intel's rdrand
+ * instruction is the canonical example of this. When a hardware RNG is
+ * available we don't need to worry about an RNG failure arising from fork()ing
+ * the process or moving a VM, so we can keep thread-local RNG state and XOR
+ * the hardware entropy in.
+ *
+ * (We assume that the OS entropy is safe from fork()ing and VM duplication.
+ * This might be a bit of a leap of faith, esp on Windows, but there's nothing
+ * that we can do about it.) */
+
+/* rand_thread_state contains the per-thread state for the RNG. This is only
+ * used if the system has support for a hardware RNG. */
+struct rand_thread_state {
+  uint8_t key[32];  /* ChaCha key, filled from |CRYPTO_sysrand| on refresh. */
+  uint64_t calls_used;  /* Per-refresh request count; also the ChaCha nonce. */
+  size_t bytes_used;  /* Bytes returned since the last refresh. */
+  uint8_t partial_block[64];  /* Buffered ChaCha output for short requests. */
+  unsigned partial_block_used;  /* Bytes of |partial_block| already used. */
+};
+
+/* kMaxCallsPerRefresh is the maximum number of |RAND_bytes| calls that we'll
+ * serve before reading a new key from the operating system. This only applies
+ * if we have a hardware RNG. */
+static const unsigned kMaxCallsPerRefresh = 1024;
+
+/* kMaxBytesPerRefresh is the maximum number of bytes that we'll return from
+ * |RAND_bytes| before reading a new key from the operating system. This only
+ * applies if we have a hardware RNG. */
+static const uint64_t kMaxBytesPerRefresh = 1024 * 1024;
+
+/* rand_thread_state_free wipes and releases a |rand_thread_state|. It is the
+ * destructor that |RAND_bytes| registers for the thread-local slot, so it runs
+ * when a thread exits. A NULL |state| is ignored. */
+static void rand_thread_state_free(void *state) {
+  if (state != NULL) {
+    /* The state holds key material, so cleanse it before freeing. */
+    OPENSSL_cleanse(state, sizeof(struct rand_thread_state));
+    OPENSSL_free(state);
+  }
+}
+
+extern void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len,
+                             const uint8_t key[32], const uint8_t nonce[8],
+                             size_t counter);
+
+/* RAND_bytes fills |buf| with |len| random bytes and returns one. Hardware RNG
+ * output, when available, is masked with a per-thread ChaCha20 keystream. */
+int RAND_bytes(uint8_t *buf, const size_t len) {
+  if (len == 0) {
+    return 1;
+  }
+
+  if (!CRYPTO_have_hwrand()) {
+    /* Without a hardware RNG to save us from address-space duplication, the OS
+     * entropy is used directly. */
+    CRYPTO_sysrand(buf, len);
+    return 1;
+  }
+
+  struct rand_thread_state *state =
+      CRYPTO_get_thread_local(OPENSSL_THREAD_LOCAL_RAND);
+  if (state == NULL) {
+    state = OPENSSL_malloc(sizeof(struct rand_thread_state));
+    if (state == NULL ||
+        !CRYPTO_set_thread_local(OPENSSL_THREAD_LOCAL_RAND, state,
+                                 rand_thread_state_free)) {
+      CRYPTO_sysrand(buf, len);
+      return 1;
+    }
+    /* |partial_block| is encrypted in place, so it must start initialised. */
+    memset(state->partial_block, 0, sizeof(state->partial_block));
+    state->calls_used = kMaxCallsPerRefresh;
+  }
+
+  if (state->calls_used >= kMaxCallsPerRefresh ||
+      state->bytes_used >= kMaxBytesPerRefresh) {
+    CRYPTO_sysrand(state->key, sizeof(state->key));
+    state->calls_used = 0;
+    state->bytes_used = 0;
+    state->partial_block_used = sizeof(state->partial_block);
+  }
+
+  CRYPTO_hwrand(buf, len);
+  if (len >= sizeof(state->partial_block)) {
+    size_t remaining = len;
+    while (remaining > 0) {
+      /* kMaxBytesPerCall is only 2GB, while ChaCha can handle 256GB. But this
+       * is sufficient and easier on 32-bit. */
+      static const size_t kMaxBytesPerCall = 0x80000000;
+      size_t todo = remaining;
+      if (todo > kMaxBytesPerCall) {
+        todo = kMaxBytesPerCall;
+      }
+      CRYPTO_chacha_20(buf, buf, todo, state->key,
+                       (uint8_t *)&state->calls_used, 0);
+      buf += todo;
+      remaining -= todo;
+      state->calls_used++;
+    }
+  } else {
+    if (sizeof(state->partial_block) - state->partial_block_used < len) {
+      CRYPTO_chacha_20(state->partial_block, state->partial_block,
+                       sizeof(state->partial_block), state->key,
+                       (uint8_t *)&state->calls_used, 0);
+      state->partial_block_used = 0;
+    }
+    unsigned i;
+    for (i = 0; i < len; i++) {
+      buf[i] ^= state->partial_block[state->partial_block_used++];
+    }
+    state->calls_used++;
+  }
+  state->bytes_used += len;
+  return 1;
+}
 
 int RAND_pseudo_bytes(uint8_t *buf, size_t len) {
   return RAND_bytes(buf, len);
diff --git a/crypto/rand/urandom.c b/crypto/rand/urandom.c
index 05043fe..788a979 100644
--- a/crypto/rand/urandom.c
+++ b/crypto/rand/urandom.c
@@ -25,6 +25,9 @@
 #include <openssl/thread.h>
 #include <openssl/mem.h>
 
+#include "internal.h"
+#include "../internal.h"
+
 
 /* This file implements a PRNG by reading from /dev/urandom, optionally with a
  * fork-safe buffer.
@@ -72,20 +75,22 @@
 /* rand_bytes_per_buf is the number of actual entropy bytes in a buffer. */
 static const size_t rand_bytes_per_buf = BUF_SIZE - sizeof(struct rand_buffer);
 
+static struct CRYPTO_STATIC_MUTEX global_lock = CRYPTO_STATIC_MUTEX_INIT;
+
 /* list_head is the start of a global, linked-list of rand_buffer objects. It's
- * protected by CRYPTO_LOCK_RAND. */
+ * protected by |global_lock|. */
 static struct rand_buffer *list_head;
 
 /* urandom_fd is a file descriptor to /dev/urandom. It's protected by
- * CRYPTO_LOCK_RAND. */
+ * |global_lock|. */
 static int urandom_fd = -2;
 
 /* urandom_buffering controls whether buffering is enabled (1) or not (0). This
- * is protected by CRYPTO_LOCK_RAND. */
+ * is protected by |global_lock|. */
 static int urandom_buffering = 0;
 
 /* urandom_get_fd_locked returns a file descriptor to /dev/urandom. The caller
- * of this function must hold CRYPTO_LOCK_RAND. */
+ * of this function must hold |global_lock|. */
 static int urandom_get_fd_locked(void) {
   if (urandom_fd != -2) {
     return urandom_fd;
@@ -100,7 +105,7 @@
 void RAND_cleanup(void) {
   struct rand_buffer *cur;
 
-  CRYPTO_w_lock(CRYPTO_LOCK_RAND);
+  CRYPTO_STATIC_MUTEX_lock_write(&global_lock);
   while ((cur = list_head)) {
     list_head = cur->next;
     OPENSSL_free(cur);
@@ -110,7 +115,7 @@
   }
   urandom_fd = -2;
   list_head = NULL;
-  CRYPTO_w_unlock(CRYPTO_LOCK_RAND);
+  CRYPTO_STATIC_MUTEX_unlock(&global_lock);
 }
 
 /* read_full reads exactly |len| bytes from |fd| into |out| and returns 1. In
@@ -133,36 +138,34 @@
   return 1;
 }
 
-/* urandom_rand_pseudo_bytes puts |num| random bytes into |out|. It returns
- * one on success and zero otherwise. */
-int RAND_bytes(uint8_t *out, size_t requested) {
+/* CRYPTO_sysrand puts |requested| random bytes into |out|. */
+void CRYPTO_sysrand(uint8_t *out, size_t requested) {
   int fd;
   struct rand_buffer *buf;
   size_t todo;
   pid_t pid, ppid;
 
   if (requested == 0) {
-    return 1;
+    return;
   }
 
-  CRYPTO_w_lock(CRYPTO_LOCK_RAND);
+  CRYPTO_STATIC_MUTEX_lock_write(&global_lock);
   fd = urandom_get_fd_locked();
 
   if (fd < 0) {
-    CRYPTO_w_unlock(CRYPTO_LOCK_RAND);
+    CRYPTO_STATIC_MUTEX_unlock(&global_lock);
     abort();
-    return 0;
+    return;
   }
 
   /* If buffering is not enabled, or if the request is large, then the
    * result comes directly from urandom. */
   if (!urandom_buffering || requested > BUF_SIZE / 2) {
-    CRYPTO_w_unlock(CRYPTO_LOCK_RAND);
+    CRYPTO_STATIC_MUTEX_unlock(&global_lock);
     if (!read_full(fd, out, requested)) {
       abort();
-      return 0;
     }
-    return 1;
+    return;
   }
 
   pid = getpid();
@@ -174,8 +177,8 @@
         rand_bytes_per_buf - buf->used >= requested) {
       memcpy(out, &buf->rand[buf->used], requested);
       buf->used += requested;
-      CRYPTO_w_unlock(CRYPTO_LOCK_RAND);
-      return 1;
+      CRYPTO_STATIC_MUTEX_unlock(&global_lock);
+      return;
     }
 
     /* If we don't immediately have enough entropy with the correct
@@ -184,13 +187,13 @@
     if (buf) {
       list_head = buf->next;
     }
-    CRYPTO_w_unlock(CRYPTO_LOCK_RAND);
+    CRYPTO_STATIC_MUTEX_unlock(&global_lock);
 
     if (!buf) {
       buf = (struct rand_buffer *)OPENSSL_malloc(BUF_SIZE);
       if (!buf) {
         abort();
-        return 0;
+        return;
       }
       /* The buffer doesn't contain any random bytes yet
        * so we mark it as fully used so that it will be
@@ -208,7 +211,7 @@
     /* We have forked and so cannot use these bytes as they
      * may have been used in another process. */
     OPENSSL_free(buf);
-    CRYPTO_w_lock(CRYPTO_LOCK_RAND);
+    CRYPTO_STATIC_MUTEX_lock_write(&global_lock);
   }
 
   while (requested > 0) {
@@ -228,18 +231,17 @@
     if (!read_full(fd, buf->rand, rand_bytes_per_buf)) {
       OPENSSL_free(buf);
       abort();
-      return 0;
+      return;
     }
 
     buf->used = 0;
   }
 
-  CRYPTO_w_lock(CRYPTO_LOCK_RAND);
+  CRYPTO_STATIC_MUTEX_lock_write(&global_lock);
   assert(list_head != buf);
   buf->next = list_head;
   list_head = buf;
-  CRYPTO_w_unlock(CRYPTO_LOCK_RAND);
-  return 1;
+  CRYPTO_STATIC_MUTEX_unlock(&global_lock);
 }
 
 #endif  /* !OPENSSL_WINDOWS */
diff --git a/crypto/rand/windows.c b/crypto/rand/windows.c
index 66d977f..7bfcb1d 100644
--- a/crypto/rand/windows.c
+++ b/crypto/rand/windows.c
@@ -32,11 +32,13 @@
 
 #pragma warning(pop)
 
+#include "internal.h"
+
 
 void RAND_cleanup(void) {
 }
 
-int RAND_bytes(uint8_t *out, size_t requested) {
+void CRYPTO_sysrand(uint8_t *out, size_t requested) {
   while (requested > 0) {
     ULONG output_bytes_this_pass = ULONG_MAX;
     if (requested < output_bytes_this_pass) {
@@ -48,7 +50,7 @@
     requested -= output_bytes_this_pass;
     out += output_bytes_this_pass;
   }
-  return 1;
+  return;
 }
 
 #endif  /* OPENSSL_WINDOWS */
diff --git a/include/openssl/rand.h b/include/openssl/rand.h
index 8e3bc30..0b2ead8 100644
--- a/include/openssl/rand.h
+++ b/include/openssl/rand.h
@@ -25,8 +25,7 @@
 /* Random number generation. */
 
 
-/* RAND_bytes writes |len| bytes of random data to |buf|. It returns one on
- * success and zero on otherwise. */
+/* RAND_bytes writes |len| bytes of random data to |buf| and returns one. */
 OPENSSL_EXPORT int RAND_bytes(uint8_t *buf, size_t len);
 
 /* RAND_cleanup frees any resources used by the RNG. This is not safe if other