Extract common rotl/rotr functions.

We have a ton of per-file rotation functions, often with generic names
that do not tell you whether they are uint32_t vs uint64_t, or rotl vs
rotr.

Additionally, (x >> r) | (x << (32 - r)) is UB at r = 0, because the
left shift is then by the full bit width, which C leaves undefined.
(x >> r) | (x << ((-r) & 31)) works for 0 <= r < 32, which is what
cast.c does. GCC and Clang recognize this pattern as a rotate, but MSVC
doesn't. MSVC does, however, provide functions for this.

We usually rotate by a non-zero constant, which makes this moot, but
rotation comes up often enough that it's worth extracting out. Some
particular changes to call out:

- I've switched sha256.c from rotl to rotr. There was a comment
  explaining why it differed from the specification. Now that we have
  both functions, it's simpler to just match the specification.

- I've dropped all the inline assembly from sha512.c. Compilers should
  be able to recognize rotations in 2021.

Change-Id: Ia1030e8bfe94dad92514ed1c28777447c48b82f9
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/49765
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/blake2/blake2.c b/crypto/blake2/blake2.c
index e3c560f..096d61d 100644
--- a/crypto/blake2/blake2.c
+++ b/crypto/blake2/blake2.c
@@ -42,19 +42,17 @@
     // clang-format on
 };
 
-#define RIGHT_ROTATE(v, n) (((v) >> (n)) | ((v) << (64 - (n))))
-
 // https://tools.ietf.org/html/rfc7693#section-3.1
 static void blake2b_mix(uint64_t v[16], int a, int b, int c, int d, uint64_t x,
                         uint64_t y) {
   v[a] = v[a] + v[b] + x;
-  v[d] = RIGHT_ROTATE(v[d] ^ v[a], 32);
+  v[d] = CRYPTO_rotr_u64(v[d] ^ v[a], 32);
   v[c] = v[c] + v[d];
-  v[b] = RIGHT_ROTATE(v[b] ^ v[c], 24);
+  v[b] = CRYPTO_rotr_u64(v[b] ^ v[c], 24);
   v[a] = v[a] + v[b] + y;
-  v[d] = RIGHT_ROTATE(v[d] ^ v[a], 16);
+  v[d] = CRYPTO_rotr_u64(v[d] ^ v[a], 16);
   v[c] = v[c] + v[d];
-  v[b] = RIGHT_ROTATE(v[b] ^ v[c], 63);
+  v[b] = CRYPTO_rotr_u64(v[b] ^ v[c], 63);
 }
 
 static void blake2b_transform(
diff --git a/crypto/chacha/chacha.c b/crypto/chacha/chacha.c
index b539f99..64ca1c4 100644
--- a/crypto/chacha/chacha.c
+++ b/crypto/chacha/chacha.c
@@ -25,22 +25,20 @@
 #include "internal.h"
 
 
-#define U8TO32_LITTLE(p)                              \
-  (((uint32_t)((p)[0])) | ((uint32_t)((p)[1]) << 8) | \
-   ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24))
-
 // sigma contains the ChaCha constants, which happen to be an ASCII string.
 static const uint8_t sigma[16] = { 'e', 'x', 'p', 'a', 'n', 'd', ' ', '3',
                                    '2', '-', 'b', 'y', 't', 'e', ' ', 'k' };
 
-#define ROTATE(v, n) (((v) << (n)) | ((v) >> (32 - (n))))
-
 // QUARTERROUND updates a, b, c, d with a ChaCha "quarter" round.
-#define QUARTERROUND(a, b, c, d)                \
-  x[a] += x[b]; x[d] = ROTATE(x[d] ^ x[a], 16); \
-  x[c] += x[d]; x[b] = ROTATE(x[b] ^ x[c], 12); \
-  x[a] += x[b]; x[d] = ROTATE(x[d] ^ x[a],  8); \
-  x[c] += x[d]; x[b] = ROTATE(x[b] ^ x[c],  7);
+#define QUARTERROUND(a, b, c, d)           \
+  x[a] += x[b];                            \
+  x[d] = CRYPTO_rotl_u32(x[d] ^ x[a], 16); \
+  x[c] += x[d];                            \
+  x[b] = CRYPTO_rotl_u32(x[b] ^ x[c], 12); \
+  x[a] += x[b];                            \
+  x[d] = CRYPTO_rotl_u32(x[d] ^ x[a], 8);  \
+  x[c] += x[d];                            \
+  x[b] = CRYPTO_rotl_u32(x[b] ^ x[c], 7);
 
 void CRYPTO_hchacha20(uint8_t out[32], const uint8_t key[32],
                       const uint8_t nonce[16]) {
@@ -71,24 +69,25 @@
                       uint32_t counter) {
   assert(!buffers_alias(out, in_len, in, in_len) || in == out);
 
-  uint32_t counter_nonce[4];  counter_nonce[0] = counter;
-  counter_nonce[1] = U8TO32_LITTLE(nonce + 0);
-  counter_nonce[2] = U8TO32_LITTLE(nonce + 4);
-  counter_nonce[3] = U8TO32_LITTLE(nonce + 8);
+  uint32_t counter_nonce[4];
+  counter_nonce[0] = counter;
+  counter_nonce[1] = CRYPTO_load_u32_le(nonce + 0);
+  counter_nonce[2] = CRYPTO_load_u32_le(nonce + 4);
+  counter_nonce[3] = CRYPTO_load_u32_le(nonce + 8);
 
   const uint32_t *key_ptr = (const uint32_t *)key;
 #if !defined(OPENSSL_X86) && !defined(OPENSSL_X86_64)
   // The assembly expects the key to be four-byte aligned.
   uint32_t key_u32[8];
   if ((((uintptr_t)key) & 3) != 0) {
-    key_u32[0] = U8TO32_LITTLE(key + 0);
-    key_u32[1] = U8TO32_LITTLE(key + 4);
-    key_u32[2] = U8TO32_LITTLE(key + 8);
-    key_u32[3] = U8TO32_LITTLE(key + 12);
-    key_u32[4] = U8TO32_LITTLE(key + 16);
-    key_u32[5] = U8TO32_LITTLE(key + 20);
-    key_u32[6] = U8TO32_LITTLE(key + 24);
-    key_u32[7] = U8TO32_LITTLE(key + 28);
+    key_u32[0] = CRYPTO_load_u32_le(key + 0);
+    key_u32[1] = CRYPTO_load_u32_le(key + 4);
+    key_u32[2] = CRYPTO_load_u32_le(key + 8);
+    key_u32[3] = CRYPTO_load_u32_le(key + 12);
+    key_u32[4] = CRYPTO_load_u32_le(key + 16);
+    key_u32[5] = CRYPTO_load_u32_le(key + 20);
+    key_u32[6] = CRYPTO_load_u32_le(key + 24);
+    key_u32[7] = CRYPTO_load_u32_le(key + 28);
 
     key_ptr = key_u32;
   }
@@ -99,14 +98,6 @@
 
 #else
 
-#define U32TO8_LITTLE(p, v)    \
-  {                            \
-    (p)[0] = (v >> 0) & 0xff;  \
-    (p)[1] = (v >> 8) & 0xff;  \
-    (p)[2] = (v >> 16) & 0xff; \
-    (p)[3] = (v >> 24) & 0xff; \
-  }
-
 // chacha_core performs 20 rounds of ChaCha on the input words in
 // |input| and writes the 64 output bytes to |output|.
 static void chacha_core(uint8_t output[64], const uint32_t input[16]) {
@@ -129,7 +120,7 @@
     x[i] += input[i];
   }
   for (i = 0; i < 16; ++i) {
-    U32TO8_LITTLE(output + 4 * i, x[i]);
+    CRYPTO_store_u32_le(output + 4 * i, x[i]);
   }
 }
 
@@ -142,25 +133,25 @@
   uint8_t buf[64];
   size_t todo, i;
 
-  input[0] = U8TO32_LITTLE(sigma + 0);
-  input[1] = U8TO32_LITTLE(sigma + 4);
-  input[2] = U8TO32_LITTLE(sigma + 8);
-  input[3] = U8TO32_LITTLE(sigma + 12);
+  input[0] = CRYPTO_load_u32_le(sigma + 0);
+  input[1] = CRYPTO_load_u32_le(sigma + 4);
+  input[2] = CRYPTO_load_u32_le(sigma + 8);
+  input[3] = CRYPTO_load_u32_le(sigma + 12);
 
-  input[4] = U8TO32_LITTLE(key + 0);
-  input[5] = U8TO32_LITTLE(key + 4);
-  input[6] = U8TO32_LITTLE(key + 8);
-  input[7] = U8TO32_LITTLE(key + 12);
+  input[4] = CRYPTO_load_u32_le(key + 0);
+  input[5] = CRYPTO_load_u32_le(key + 4);
+  input[6] = CRYPTO_load_u32_le(key + 8);
+  input[7] = CRYPTO_load_u32_le(key + 12);
 
-  input[8] = U8TO32_LITTLE(key + 16);
-  input[9] = U8TO32_LITTLE(key + 20);
-  input[10] = U8TO32_LITTLE(key + 24);
-  input[11] = U8TO32_LITTLE(key + 28);
+  input[8] = CRYPTO_load_u32_le(key + 16);
+  input[9] = CRYPTO_load_u32_le(key + 20);
+  input[10] = CRYPTO_load_u32_le(key + 24);
+  input[11] = CRYPTO_load_u32_le(key + 28);
 
   input[12] = counter;
-  input[13] = U8TO32_LITTLE(nonce + 0);
-  input[14] = U8TO32_LITTLE(nonce + 4);
-  input[15] = U8TO32_LITTLE(nonce + 8);
+  input[13] = CRYPTO_load_u32_le(nonce + 0);
+  input[14] = CRYPTO_load_u32_le(nonce + 4);
+  input[15] = CRYPTO_load_u32_le(nonce + 8);
 
   while (in_len > 0) {
     todo = sizeof(buf);
diff --git a/crypto/evp/scrypt.c b/crypto/evp/scrypt.c
index 2feb650..7ec6244 100644
--- a/crypto/evp/scrypt.c
+++ b/crypto/evp/scrypt.c
@@ -32,8 +32,6 @@
 
 OPENSSL_STATIC_ASSERT(sizeof(block_t) == 64, "block_t has padding");
 
-#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
-
 // salsa208_word_specification implements the Salsa20/8 core function, also
 // described in RFC 7914, section 3. It modifies the block at |inout|
 // in-place.
@@ -42,38 +40,38 @@
   OPENSSL_memcpy(&x, inout, sizeof(x));
 
   for (int i = 8; i > 0; i -= 2) {
-    x.words[4] ^= R(x.words[0] + x.words[12], 7);
-    x.words[8] ^= R(x.words[4] + x.words[0], 9);
-    x.words[12] ^= R(x.words[8] + x.words[4], 13);
-    x.words[0] ^= R(x.words[12] + x.words[8], 18);
-    x.words[9] ^= R(x.words[5] + x.words[1], 7);
-    x.words[13] ^= R(x.words[9] + x.words[5], 9);
-    x.words[1] ^= R(x.words[13] + x.words[9], 13);
-    x.words[5] ^= R(x.words[1] + x.words[13], 18);
-    x.words[14] ^= R(x.words[10] + x.words[6], 7);
-    x.words[2] ^= R(x.words[14] + x.words[10], 9);
-    x.words[6] ^= R(x.words[2] + x.words[14], 13);
-    x.words[10] ^= R(x.words[6] + x.words[2], 18);
-    x.words[3] ^= R(x.words[15] + x.words[11], 7);
-    x.words[7] ^= R(x.words[3] + x.words[15], 9);
-    x.words[11] ^= R(x.words[7] + x.words[3], 13);
-    x.words[15] ^= R(x.words[11] + x.words[7], 18);
-    x.words[1] ^= R(x.words[0] + x.words[3], 7);
-    x.words[2] ^= R(x.words[1] + x.words[0], 9);
-    x.words[3] ^= R(x.words[2] + x.words[1], 13);
-    x.words[0] ^= R(x.words[3] + x.words[2], 18);
-    x.words[6] ^= R(x.words[5] + x.words[4], 7);
-    x.words[7] ^= R(x.words[6] + x.words[5], 9);
-    x.words[4] ^= R(x.words[7] + x.words[6], 13);
-    x.words[5] ^= R(x.words[4] + x.words[7], 18);
-    x.words[11] ^= R(x.words[10] + x.words[9], 7);
-    x.words[8] ^= R(x.words[11] + x.words[10], 9);
-    x.words[9] ^= R(x.words[8] + x.words[11], 13);
-    x.words[10] ^= R(x.words[9] + x.words[8], 18);
-    x.words[12] ^= R(x.words[15] + x.words[14], 7);
-    x.words[13] ^= R(x.words[12] + x.words[15], 9);
-    x.words[14] ^= R(x.words[13] + x.words[12], 13);
-    x.words[15] ^= R(x.words[14] + x.words[13], 18);
+    x.words[4] ^= CRYPTO_rotl_u32(x.words[0] + x.words[12], 7);
+    x.words[8] ^= CRYPTO_rotl_u32(x.words[4] + x.words[0], 9);
+    x.words[12] ^= CRYPTO_rotl_u32(x.words[8] + x.words[4], 13);
+    x.words[0] ^= CRYPTO_rotl_u32(x.words[12] + x.words[8], 18);
+    x.words[9] ^= CRYPTO_rotl_u32(x.words[5] + x.words[1], 7);
+    x.words[13] ^= CRYPTO_rotl_u32(x.words[9] + x.words[5], 9);
+    x.words[1] ^= CRYPTO_rotl_u32(x.words[13] + x.words[9], 13);
+    x.words[5] ^= CRYPTO_rotl_u32(x.words[1] + x.words[13], 18);
+    x.words[14] ^= CRYPTO_rotl_u32(x.words[10] + x.words[6], 7);
+    x.words[2] ^= CRYPTO_rotl_u32(x.words[14] + x.words[10], 9);
+    x.words[6] ^= CRYPTO_rotl_u32(x.words[2] + x.words[14], 13);
+    x.words[10] ^= CRYPTO_rotl_u32(x.words[6] + x.words[2], 18);
+    x.words[3] ^= CRYPTO_rotl_u32(x.words[15] + x.words[11], 7);
+    x.words[7] ^= CRYPTO_rotl_u32(x.words[3] + x.words[15], 9);
+    x.words[11] ^= CRYPTO_rotl_u32(x.words[7] + x.words[3], 13);
+    x.words[15] ^= CRYPTO_rotl_u32(x.words[11] + x.words[7], 18);
+    x.words[1] ^= CRYPTO_rotl_u32(x.words[0] + x.words[3], 7);
+    x.words[2] ^= CRYPTO_rotl_u32(x.words[1] + x.words[0], 9);
+    x.words[3] ^= CRYPTO_rotl_u32(x.words[2] + x.words[1], 13);
+    x.words[0] ^= CRYPTO_rotl_u32(x.words[3] + x.words[2], 18);
+    x.words[6] ^= CRYPTO_rotl_u32(x.words[5] + x.words[4], 7);
+    x.words[7] ^= CRYPTO_rotl_u32(x.words[6] + x.words[5], 9);
+    x.words[4] ^= CRYPTO_rotl_u32(x.words[7] + x.words[6], 13);
+    x.words[5] ^= CRYPTO_rotl_u32(x.words[4] + x.words[7], 18);
+    x.words[11] ^= CRYPTO_rotl_u32(x.words[10] + x.words[9], 7);
+    x.words[8] ^= CRYPTO_rotl_u32(x.words[11] + x.words[10], 9);
+    x.words[9] ^= CRYPTO_rotl_u32(x.words[8] + x.words[11], 13);
+    x.words[10] ^= CRYPTO_rotl_u32(x.words[9] + x.words[8], 18);
+    x.words[12] ^= CRYPTO_rotl_u32(x.words[15] + x.words[14], 7);
+    x.words[13] ^= CRYPTO_rotl_u32(x.words[12] + x.words[15], 9);
+    x.words[14] ^= CRYPTO_rotl_u32(x.words[13] + x.words[12], 13);
+    x.words[15] ^= CRYPTO_rotl_u32(x.words[14] + x.words[13], 18);
   }
 
   for (int i = 0; i < 16; ++i) {
diff --git a/crypto/fipsmodule/aes/aes_test.cc b/crypto/fipsmodule/aes/aes_test.cc
index 406e949..eef2567 100644
--- a/crypto/fipsmodule/aes/aes_test.cc
+++ b/crypto/fipsmodule/aes/aes_test.cc
@@ -403,10 +403,6 @@
   return a0 | (a1 << 8) | (a2 << 16) | (a3 << 24);
 }
 
-static uint32_t aes_ref_rot_word(uint32_t in, uint32_t n) {
-  return (in >> n) | (in << (32 - n));
-}
-
 static int aes_ref_set_encrypt_key(const uint8_t *key, int key_bits,
                                    AES_KEY *out) {
   static const uint32_t kRCon[10] = {0x01, 0x02, 0x04, 0x08, 0x10,
@@ -431,7 +427,7 @@
   for (size_t i = words; i < num_subkey_words; i++) {
     uint32_t tmp = out->rd_key[i - 1];
     if (i % words == 0) {
-      tmp = aes_ref_sub_word(aes_ref_rot_word(tmp, 8)) ^ kRCon[(i / words) - 1];
+      tmp = aes_ref_sub_word(CRYPTO_rotr_u32(tmp, 8)) ^ kRCon[(i / words) - 1];
     } else if (key_bits == 256 && i % 4 == 0) {
       tmp = aes_ref_sub_word(tmp);
     }
@@ -532,9 +528,9 @@
   for (size_t i = 0; i < 4; i++) {
     uint32_t in = block[i];
     block[i] = kInvMixColumn[in >> 24];
-    block[i] ^= aes_ref_rot_word(kInvMixColumn[(in >> 16) & 0xff], 8);
-    block[i] ^= aes_ref_rot_word(kInvMixColumn[(in >> 8) & 0xff], 16);
-    block[i] ^= aes_ref_rot_word(kInvMixColumn[in & 0xff], 24);
+    block[i] ^= CRYPTO_rotr_u32(kInvMixColumn[(in >> 16) & 0xff], 8);
+    block[i] ^= CRYPTO_rotr_u32(kInvMixColumn[(in >> 8) & 0xff], 16);
+    block[i] ^= CRYPTO_rotr_u32(kInvMixColumn[in & 0xff], 24);
   }
 }
 
diff --git a/crypto/fipsmodule/des/des.c b/crypto/fipsmodule/des/des.c
index 2b0fdcd..95c430c 100644
--- a/crypto/fipsmodule/des/des.c
+++ b/crypto/fipsmodule/des/des.c
@@ -342,10 +342,10 @@
 
     // table contained 0213 4657
     t2 = ((t << 16L) | (s & 0x0000ffffL)) & 0xffffffffL;
-    schedule->subkeys[i][0] = ROTATE(t2, 30) & 0xffffffffL;
+    schedule->subkeys[i][0] = CRYPTO_rotr_u32(t2, 30);
 
     t2 = ((s >> 16L) | (t & 0xffff0000L));
-    schedule->subkeys[i][1] = ROTATE(t2, 26) & 0xffffffffL;
+    schedule->subkeys[i][1] = CRYPTO_rotr_u32(t2, 26);
   }
 }
 
@@ -392,8 +392,8 @@
   // <71755.204@CompuServe.COM> for pointing this out.
   // clear the top bits on machines with 8byte longs
   // shift left by 2
-  r = ROTATE(r, 29) & 0xffffffffL;
-  l = ROTATE(l, 29) & 0xffffffffL;
+  r = CRYPTO_rotr_u32(r, 29);
+  l = CRYPTO_rotr_u32(l, 29);
 
   // I don't know if it is worth the effort of loop unrolling the
   // inner loop
@@ -434,8 +434,8 @@
   }
 
   // rotate and clear the top bits on machines with 8byte longs
-  l = ROTATE(l, 3) & 0xffffffffL;
-  r = ROTATE(r, 3) & 0xffffffffL;
+  l = CRYPTO_rotr_u32(l, 3);
+  r = CRYPTO_rotr_u32(r, 3);
 
   FP(r, l);
   data[0] = l;
@@ -454,8 +454,8 @@
   // sparc2. Thanks to Richard Outerbridge <71755.204@CompuServe.COM> for
   // pointing this out.
   // clear the top bits on machines with 8byte longs
-  r = ROTATE(r, 29) & 0xffffffffL;
-  l = ROTATE(l, 29) & 0xffffffffL;
+  r = CRYPTO_rotr_u32(r, 29);
+  l = CRYPTO_rotr_u32(l, 29);
 
   // I don't know if it is worth the effort of loop unrolling the
   // inner loop
@@ -495,8 +495,8 @@
     D_ENCRYPT(ks, r, l, 0);
   }
   // rotate and clear the top bits on machines with 8byte longs
-  data[0] = ROTATE(l, 3) & 0xffffffffL;
-  data[1] = ROTATE(r, 3) & 0xffffffffL;
+  data[0] = CRYPTO_rotr_u32(l, 3);
+  data[1] = CRYPTO_rotr_u32(r, 3);
 }
 
 void DES_encrypt3(uint32_t *data, const DES_key_schedule *ks1,
@@ -782,4 +782,3 @@
 #undef D_ENCRYPT
 #undef ITERATIONS
 #undef HALF_ITERATIONS
-#undef ROTATE
diff --git a/crypto/fipsmodule/des/internal.h b/crypto/fipsmodule/des/internal.h
index 1ae3f22..3e3992e 100644
--- a/crypto/fipsmodule/des/internal.h
+++ b/crypto/fipsmodule/des/internal.h
@@ -218,7 +218,7 @@
 #define D_ENCRYPT(ks, LL, R, S)                                                \
   do {                                                                         \
     LOAD_DATA(ks, R, S, u, t, E0, E1);                                         \
-    t = ROTATE(t, 4);                                                          \
+    t = CRYPTO_rotr_u32(t, 4);                                                 \
     (LL) ^=                                                                    \
         DES_SPtrans[0][(u >> 2L) & 0x3f] ^ DES_SPtrans[2][(u >> 10L) & 0x3f] ^ \
         DES_SPtrans[4][(u >> 18L) & 0x3f] ^                                    \
@@ -230,8 +230,6 @@
 #define ITERATIONS 16
 #define HALF_ITERATIONS 8
 
-#define ROTATE(a, n) (((a) >> (n)) + ((a) << (32 - (n))))
-
 
 #if defined(__cplusplus)
 }  // extern C
diff --git a/crypto/fipsmodule/md4/md4.c b/crypto/fipsmodule/md4/md4.c
index a505d05..5b44653 100644
--- a/crypto/fipsmodule/md4/md4.c
+++ b/crypto/fipsmodule/md4/md4.c
@@ -113,24 +113,22 @@
 #define G(b, c, d) (((b) & (c)) | ((b) & (d)) | ((c) & (d)))
 #define H(b, c, d) ((b) ^ (c) ^ (d))
 
-#define ROTATE(a, n) (((a) << (n)) | ((a) >> (32 - (n))))
-
 #define R0(a, b, c, d, k, s, t)            \
   do {                                     \
     (a) += ((k) + (t) + F((b), (c), (d))); \
-    (a) = ROTATE(a, s);                    \
+    (a) = CRYPTO_rotl_u32(a, s);           \
   } while (0)
 
 #define R1(a, b, c, d, k, s, t)            \
   do {                                     \
     (a) += ((k) + (t) + G((b), (c), (d))); \
-    (a) = ROTATE(a, s);                    \
+    (a) = CRYPTO_rotl_u32(a, s);           \
   } while (0)
 
 #define R2(a, b, c, d, k, s, t)            \
   do {                                     \
     (a) += ((k) + (t) + H((b), (c), (d))); \
-    (a) = ROTATE(a, s);                    \
+    (a) = CRYPTO_rotl_u32(a, s);           \
   } while (0)
 
 void md4_block_data_order(uint32_t *state, const uint8_t *data, size_t num) {
@@ -237,7 +235,6 @@
 #undef F
 #undef G
 #undef H
-#undef ROTATE
 #undef R0
 #undef R1
 #undef R2
diff --git a/crypto/fipsmodule/md5/md5.c b/crypto/fipsmodule/md5/md5.c
index eba34bc..1691526 100644
--- a/crypto/fipsmodule/md5/md5.c
+++ b/crypto/fipsmodule/md5/md5.c
@@ -119,33 +119,31 @@
 #define H(b, c, d) ((b) ^ (c) ^ (d))
 #define I(b, c, d) (((~(d)) | (b)) ^ (c))
 
-#define ROTATE(a, n) (((a) << (n)) | ((a) >> (32 - (n))))
-
 #define R0(a, b, c, d, k, s, t)            \
   do {                                     \
     (a) += ((k) + (t) + F((b), (c), (d))); \
-    (a) = ROTATE(a, s);                    \
+    (a) = CRYPTO_rotl_u32(a, s);           \
     (a) += (b);                            \
   } while (0)
 
 #define R1(a, b, c, d, k, s, t)            \
   do {                                     \
     (a) += ((k) + (t) + G((b), (c), (d))); \
-    (a) = ROTATE(a, s);                    \
+    (a) = CRYPTO_rotl_u32(a, s);           \
     (a) += (b);                            \
   } while (0)
 
 #define R2(a, b, c, d, k, s, t)            \
   do {                                     \
     (a) += ((k) + (t) + H((b), (c), (d))); \
-    (a) = ROTATE(a, s);                    \
+    (a) = CRYPTO_rotl_u32(a, s);           \
     (a) += (b);                            \
   } while (0)
 
 #define R3(a, b, c, d, k, s, t)            \
   do {                                     \
     (a) += ((k) + (t) + I((b), (c), (d))); \
-    (a) = ROTATE(a, s);                    \
+    (a) = CRYPTO_rotl_u32(a, s);           \
     (a) += (b);                            \
   } while (0)
 
@@ -280,7 +278,6 @@
 #undef G
 #undef H
 #undef I
-#undef ROTATE
 #undef R0
 #undef R1
 #undef R2
diff --git a/crypto/fipsmodule/sha/sha1.c b/crypto/fipsmodule/sha/sha1.c
index c629308..e482c77 100644
--- a/crypto/fipsmodule/sha/sha1.c
+++ b/crypto/fipsmodule/sha/sha1.c
@@ -111,11 +111,10 @@
   return 1;
 }
 
-#define ROTATE(a, n) (((a) << (n)) | ((a) >> (32 - (n))))
-#define Xupdate(a, ix, ia, ib, ic, id) \
-  do {                                 \
-    (a) = ((ia) ^ (ib) ^ (ic) ^ (id)); \
-    (ix) = (a) = ROTATE((a), 1);       \
+#define Xupdate(a, ix, ia, ib, ic, id)    \
+  do {                                    \
+    (a) = ((ia) ^ (ib) ^ (ic) ^ (id));    \
+    (ix) = (a) = CRYPTO_rotl_u32((a), 1); \
   } while (0)
 
 #define K_00_19 0x5a827999UL
@@ -133,45 +132,47 @@
 #define F_40_59(b, c, d) (((b) & (c)) | (((b) | (c)) & (d)))
 #define F_60_79(b, c, d) F_20_39(b, c, d)
 
-#define BODY_00_15(i, a, b, c, d, e, f, xi)                               \
-  do {                                                                    \
-    (f) = (xi) + (e) + K_00_19 + ROTATE((a), 5) + F_00_19((b), (c), (d)); \
-    (b) = ROTATE((b), 30);                                                \
+#define BODY_00_15(i, a, b, c, d, e, f, xi)                \
+  do {                                                     \
+    (f) = (xi) + (e) + K_00_19 + CRYPTO_rotl_u32((a), 5) + \
+          F_00_19((b), (c), (d));                          \
+    (b) = CRYPTO_rotl_u32((b), 30);                        \
   } while (0)
 
-#define BODY_16_19(i, a, b, c, d, e, f, xi, xa, xb, xc, xd)         \
-  do {                                                              \
-    Xupdate(f, xi, xa, xb, xc, xd);                                 \
-    (f) += (e) + K_00_19 + ROTATE((a), 5) + F_00_19((b), (c), (d)); \
-    (b) = ROTATE((b), 30);                                          \
+#define BODY_16_19(i, a, b, c, d, e, f, xi, xa, xb, xc, xd)                  \
+  do {                                                                       \
+    Xupdate(f, xi, xa, xb, xc, xd);                                          \
+    (f) += (e) + K_00_19 + CRYPTO_rotl_u32((a), 5) + F_00_19((b), (c), (d)); \
+    (b) = CRYPTO_rotl_u32((b), 30);                                          \
   } while (0)
 
-#define BODY_20_31(i, a, b, c, d, e, f, xi, xa, xb, xc, xd)         \
-  do {                                                              \
-    Xupdate(f, xi, xa, xb, xc, xd);                                 \
-    (f) += (e) + K_20_39 + ROTATE((a), 5) + F_20_39((b), (c), (d)); \
-    (b) = ROTATE((b), 30);                                          \
+#define BODY_20_31(i, a, b, c, d, e, f, xi, xa, xb, xc, xd)                  \
+  do {                                                                       \
+    Xupdate(f, xi, xa, xb, xc, xd);                                          \
+    (f) += (e) + K_20_39 + CRYPTO_rotl_u32((a), 5) + F_20_39((b), (c), (d)); \
+    (b) = CRYPTO_rotl_u32((b), 30);                                          \
   } while (0)
 
-#define BODY_32_39(i, a, b, c, d, e, f, xa, xb, xc, xd)             \
-  do {                                                              \
-    Xupdate(f, xa, xa, xb, xc, xd);                                 \
-    (f) += (e) + K_20_39 + ROTATE((a), 5) + F_20_39((b), (c), (d)); \
-    (b) = ROTATE((b), 30);                                          \
+#define BODY_32_39(i, a, b, c, d, e, f, xa, xb, xc, xd)                      \
+  do {                                                                       \
+    Xupdate(f, xa, xa, xb, xc, xd);                                          \
+    (f) += (e) + K_20_39 + CRYPTO_rotl_u32((a), 5) + F_20_39((b), (c), (d)); \
+    (b) = CRYPTO_rotl_u32((b), 30);                                          \
   } while (0)
 
-#define BODY_40_59(i, a, b, c, d, e, f, xa, xb, xc, xd)             \
-  do {                                                              \
-    Xupdate(f, xa, xa, xb, xc, xd);                                 \
-    (f) += (e) + K_40_59 + ROTATE((a), 5) + F_40_59((b), (c), (d)); \
-    (b) = ROTATE((b), 30);                                          \
+#define BODY_40_59(i, a, b, c, d, e, f, xa, xb, xc, xd)                      \
+  do {                                                                       \
+    Xupdate(f, xa, xa, xb, xc, xd);                                          \
+    (f) += (e) + K_40_59 + CRYPTO_rotl_u32((a), 5) + F_40_59((b), (c), (d)); \
+    (b) = CRYPTO_rotl_u32((b), 30);                                          \
   } while (0)
 
-#define BODY_60_79(i, a, b, c, d, e, f, xa, xb, xc, xd)                   \
-  do {                                                                    \
-    Xupdate(f, xa, xa, xb, xc, xd);                                       \
-    (f) = (xa) + (e) + K_60_79 + ROTATE((a), 5) + F_60_79((b), (c), (d)); \
-    (b) = ROTATE((b), 30);                                                \
+#define BODY_60_79(i, a, b, c, d, e, f, xa, xb, xc, xd)    \
+  do {                                                     \
+    Xupdate(f, xa, xa, xb, xc, xd);                        \
+    (f) = (xa) + (e) + K_60_79 + CRYPTO_rotl_u32((a), 5) + \
+          F_60_79((b), (c), (d));                          \
+    (b) = CRYPTO_rotl_u32((b), 30);                        \
   } while (0)
 
 #ifdef X
@@ -338,7 +339,6 @@
 }
 #endif
 
-#undef ROTATE
 #undef Xupdate
 #undef K_00_19
 #undef K_20_39
diff --git a/crypto/fipsmodule/sha/sha256.c b/crypto/fipsmodule/sha/sha256.c
index 4394f4a..c187c4a 100644
--- a/crypto/fipsmodule/sha/sha256.c
+++ b/crypto/fipsmodule/sha/sha256.c
@@ -184,15 +184,17 @@
     0x682e6ff3UL, 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL,
     0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL};
 
-#define ROTATE(a, n) (((a) << (n)) | ((a) >> (32 - (n))))
-
-// FIPS specification refers to right rotations, while our ROTATE macro
-// is left one. This is why you might notice that rotation coefficients
-// differ from those observed in FIPS document by 32-N...
-#define Sigma0(x) (ROTATE((x), 30) ^ ROTATE((x), 19) ^ ROTATE((x), 10))
-#define Sigma1(x) (ROTATE((x), 26) ^ ROTATE((x), 21) ^ ROTATE((x), 7))
-#define sigma0(x) (ROTATE((x), 25) ^ ROTATE((x), 14) ^ ((x) >> 3))
-#define sigma1(x) (ROTATE((x), 15) ^ ROTATE((x), 13) ^ ((x) >> 10))
+// See FIPS 180-4, section 4.1.2.
+#define Sigma0(x)                                       \
+  (CRYPTO_rotr_u32((x), 2) ^ CRYPTO_rotr_u32((x), 13) ^ \
+   CRYPTO_rotr_u32((x), 22))
+#define Sigma1(x)                                       \
+  (CRYPTO_rotr_u32((x), 6) ^ CRYPTO_rotr_u32((x), 11) ^ \
+   CRYPTO_rotr_u32((x), 25))
+#define sigma0(x) \
+  (CRYPTO_rotr_u32((x), 7) ^ CRYPTO_rotr_u32((x), 18) ^ ((x) >> 3))
+#define sigma1(x) \
+  (CRYPTO_rotr_u32((x), 17) ^ CRYPTO_rotr_u32((x), 19) ^ ((x) >> 10))
 
 #define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z)))
 #define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
@@ -309,7 +311,6 @@
   sha256_block_data_order(state, data, num_blocks);
 }
 
-#undef ROTATE
 #undef Sigma0
 #undef Sigma1
 #undef sigma0
diff --git a/crypto/fipsmodule/sha/sha512.c b/crypto/fipsmodule/sha/sha512.c
index befdd52..d94de28 100644
--- a/crypto/fipsmodule/sha/sha512.c
+++ b/crypto/fipsmodule/sha/sha512.c
@@ -321,42 +321,16 @@
     UINT64_C(0x5fcb6fab3ad6faec), UINT64_C(0x6c44198c4a475817),
 };
 
-#if defined(__GNUC__) && __GNUC__ >= 2 && !defined(OPENSSL_NO_ASM)
-#if defined(__x86_64) || defined(__x86_64__)
-#define ROTR(a, n)                                              \
-  ({                                                            \
-    uint64_t ret;                                               \
-    __asm__("rorq %1, %0" : "=r"(ret) : "J"(n), "0"(a) : "cc"); \
-    ret;                                                        \
-  })
-#elif(defined(_ARCH_PPC) && defined(__64BIT__)) || defined(_ARCH_PPC64)
-#define ROTR(a, n)                                             \
-  ({                                                           \
-    uint64_t ret;                                              \
-    __asm__("rotrdi %0, %1, %2" : "=r"(ret) : "r"(a), "K"(n)); \
-    ret;                                                       \
-  })
-#elif defined(__aarch64__)
-#define ROTR(a, n)                                          \
-  ({                                                        \
-    uint64_t ret;                                           \
-    __asm__("ror %0, %1, %2" : "=r"(ret) : "r"(a), "I"(n)); \
-    ret;                                                    \
-  })
-#endif
-#elif defined(_MSC_VER) && defined(_WIN64)
-#pragma intrinsic(_rotr64)
-#define ROTR(a, n) _rotr64((a), n)
-#endif
-
-#ifndef ROTR
-#define ROTR(x, s) (((x) >> s) | (x) << (64 - s))
-#endif
-
-#define Sigma0(x) (ROTR((x), 28) ^ ROTR((x), 34) ^ ROTR((x), 39))
-#define Sigma1(x) (ROTR((x), 14) ^ ROTR((x), 18) ^ ROTR((x), 41))
-#define sigma0(x) (ROTR((x), 1) ^ ROTR((x), 8) ^ ((x) >> 7))
-#define sigma1(x) (ROTR((x), 19) ^ ROTR((x), 61) ^ ((x) >> 6))
+#define Sigma0(x)                                        \
+  (CRYPTO_rotr_u64((x), 28) ^ CRYPTO_rotr_u64((x), 34) ^ \
+   CRYPTO_rotr_u64((x), 39))
+#define Sigma1(x)                                        \
+  (CRYPTO_rotr_u64((x), 14) ^ CRYPTO_rotr_u64((x), 18) ^ \
+   CRYPTO_rotr_u64((x), 41))
+#define sigma0(x) \
+  (CRYPTO_rotr_u64((x), 1) ^ CRYPTO_rotr_u64((x), 8) ^ ((x) >> 7))
+#define sigma1(x) \
+  (CRYPTO_rotr_u64((x), 19) ^ CRYPTO_rotr_u64((x), 61) ^ ((x) >> 6))
 
 #define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z)))
 #define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
@@ -524,7 +498,6 @@
 
 #endif  // !SHA512_ASM
 
-#undef ROTR
 #undef Sigma0
 #undef Sigma1
 #undef sigma0
diff --git a/crypto/internal.h b/crypto/internal.h
index 03bb779..41c42dd 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -890,6 +890,45 @@
 }
 
 
+// Bit rotation functions.
+//
+// Note these functions use |(-shift) & 31|, etc., because shifting by the bit
+// width is undefined. Both Clang and GCC recognize this pattern as a rotation,
+// but MSVC does not. Instead, we call MSVC's built-in functions.
+
+static inline uint32_t CRYPTO_rotl_u32(uint32_t value, int shift) {
+#if defined(_MSC_VER)
+  return _rotl(value, shift);
+#else
+  return (value << shift) | (value >> ((-shift) & 31));
+#endif
+}
+
+static inline uint32_t CRYPTO_rotr_u32(uint32_t value, int shift) {
+#if defined(_MSC_VER)
+  return _rotr(value, shift);
+#else
+  return (value >> shift) | (value << ((-shift) & 31));
+#endif
+}
+
+static inline uint64_t CRYPTO_rotl_u64(uint64_t value, int shift) {
+#if defined(_MSC_VER)
+  return _rotl64(value, shift);
+#else
+  return (value << shift) | (value >> ((-shift) & 63));
+#endif
+}
+
+static inline uint64_t CRYPTO_rotr_u64(uint64_t value, int shift) {
+#if defined(_MSC_VER)
+  return _rotr64(value, shift);
+#else
+  return (value >> shift) | (value << ((-shift) & 63));
+#endif
+}
+
+
 // FIPS functions.
 
 #if defined(BORINGSSL_FIPS)
diff --git a/crypto/siphash/siphash.c b/crypto/siphash/siphash.c
index f55c3ca..bb9a0c1 100644
--- a/crypto/siphash/siphash.c
+++ b/crypto/siphash/siphash.c
@@ -23,18 +23,18 @@
 static void siphash_round(uint64_t v[4]) {
   v[0] += v[1];
   v[2] += v[3];
-  v[1] = (v[1] << 13) | (v[1] >> (64 - 13));
-  v[3] = (v[3] << 16) | (v[3] >> (64 - 16));
+  v[1] = CRYPTO_rotl_u64(v[1], 13);
+  v[3] = CRYPTO_rotl_u64(v[3], 16);
   v[1] ^= v[0];
   v[3] ^= v[2];
-  v[0] = (v[0] << 32) | (v[0] >> 32);
+  v[0] = CRYPTO_rotl_u64(v[0], 32);
   v[2] += v[1];
   v[0] += v[3];
-  v[1] = (v[1] << 17) | (v[1] >> (64 - 17));
-  v[3] = (v[3] << 21) | (v[3] >> (64 - 21));
+  v[1] = CRYPTO_rotl_u64(v[1], 17);
+  v[3] = CRYPTO_rotl_u64(v[3], 21);
   v[1] ^= v[2];
   v[3] ^= v[0];
-  v[2] = (v[2] << 32) | (v[2] >> 32);
+  v[2] = CRYPTO_rotl_u64(v[2], 32);
 }
 
 uint64_t SIPHASH_24(const uint64_t key[2], const uint8_t *input,
diff --git a/decrepit/cast/cast.c b/decrepit/cast/cast.c
index 8fd4e3a..dffee5c 100644
--- a/decrepit/cast/cast.c
+++ b/decrepit/cast/cast.c
@@ -84,22 +84,16 @@
   l2n(d[1], out);
 }
 
-#if defined(OPENSSL_WINDOWS) && defined(_MSC_VER)
-#define ROTL(a, n) (_lrotl(a, n))
-#else
-#define ROTL(a, n) ((((a) << (n)) | ((a) >> ((-(n))&31))) & 0xffffffffL)
-#endif
-
-#define E_CAST(n, key, L, R, OP1, OP2, OP3)                                   \
-  {                                                                           \
-    uint32_t a, b, c, d;                                                      \
-    t = (key[n * 2] OP1 R) & 0xffffffff;                                      \
-    t = ROTL(t, (key[n * 2 + 1]));                                            \
-    a = CAST_S_table0[(t >> 8) & 0xff];                                       \
-    b = CAST_S_table1[(t)&0xff];                                              \
-    c = CAST_S_table2[(t >> 24) & 0xff];                                      \
-    d = CAST_S_table3[(t >> 16) & 0xff];                                      \
-    L ^= (((((a OP2 b)&0xffffffffL)OP3 c) & 0xffffffffL)OP1 d) & 0xffffffffL; \
+#define E_CAST(n, key, L, R, OP1, OP2, OP3)                                    \
+  {                                                                            \
+    uint32_t a, b, c, d;                                                       \
+    t = (key[n * 2] OP1 R) & 0xffffffff;                                       \
+    t = CRYPTO_rotl_u32(t, (key[n * 2 + 1]));                                  \
+    a = CAST_S_table0[(t >> 8) & 0xff];                                        \
+    b = CAST_S_table1[(t)&0xff];                                               \
+    c = CAST_S_table2[(t >> 24) & 0xff];                                       \
+    d = CAST_S_table3[(t >> 16) & 0xff];                                       \
+    L ^= (((((a OP2 b)&0xffffffffL)OP3 c) & 0xffffffffL) OP1 d) & 0xffffffffL; \
   }
 
 void CAST_encrypt(uint32_t *data, const CAST_KEY *key) {
diff --git a/decrepit/ripemd/ripemd.c b/decrepit/ripemd/ripemd.c
index 9120cdd..3ae6904 100644
--- a/decrepit/ripemd/ripemd.c
+++ b/decrepit/ripemd/ripemd.c
@@ -112,41 +112,39 @@
 #define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y))
 #define F5(x, y, z) (((~(z)) | (y)) ^ (x))
 
-#define ROTATE(a, n) (((a) << (n)) | (((a)&0xffffffff) >> (32 - (n))))
-
-#define RIP1(a, b, c, d, e, w, s) \
-  {                               \
-    a += F1(b, c, d) + X(w);      \
-    a = ROTATE(a, s) + e;         \
-    c = ROTATE(c, 10);            \
+#define RIP1(a, b, c, d, e, w, s)  \
+  {                                \
+    a += F1(b, c, d) + X(w);       \
+    a = CRYPTO_rotl_u32(a, s) + e; \
+    c = CRYPTO_rotl_u32(c, 10);    \
   }
 
 #define RIP2(a, b, c, d, e, w, s, K) \
   {                                  \
     a += F2(b, c, d) + X(w) + K;     \
-    a = ROTATE(a, s) + e;            \
-    c = ROTATE(c, 10);               \
+    a = CRYPTO_rotl_u32(a, s) + e;   \
+    c = CRYPTO_rotl_u32(c, 10);      \
   }
 
 #define RIP3(a, b, c, d, e, w, s, K) \
   {                                  \
     a += F3(b, c, d) + X(w) + K;     \
-    a = ROTATE(a, s) + e;            \
-    c = ROTATE(c, 10);               \
+    a = CRYPTO_rotl_u32(a, s) + e;   \
+    c = CRYPTO_rotl_u32(c, 10);      \
   }
 
 #define RIP4(a, b, c, d, e, w, s, K) \
   {                                  \
     a += F4(b, c, d) + X(w) + K;     \
-    a = ROTATE(a, s) + e;            \
-    c = ROTATE(c, 10);               \
+    a = CRYPTO_rotl_u32(a, s) + e;   \
+    c = CRYPTO_rotl_u32(c, 10);      \
   }
 
 #define RIP5(a, b, c, d, e, w, s, K) \
   {                                  \
     a += F5(b, c, d) + X(w) + K;     \
-    a = ROTATE(a, s) + e;            \
-    c = ROTATE(c, 10);               \
+    a = CRYPTO_rotl_u32(a, s) + e;   \
+    c = CRYPTO_rotl_u32(c, 10);      \
   }
 
 #define KL0 0x00000000L