Rename BSWAP[48] to CRYPTO_bswap[48] and always define them.

Previously, gcm.c contained a lot of workarounds for cases where BSWAP8
wasn't defined. Rather than handle this in each place, just make it
always available.

While we're here, make these macros inline functions instead and rename
them to something less likely to collide.
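
For reference, the renamed helpers are plain inline functions with the
same semantics on every platform (a standalone sketch mirroring the
TestByteSwap case added in gcm_test.cc; the include path is
illustrative):

    #include <assert.h>
    #include <stdint.h>

    #include "crypto/modes/internal.h"

    int main(void) {
      /* Byte order is reversed regardless of host endianness. */
      assert(CRYPTO_bswap4(0x01020304) == 0x04030201);
      assert(CRYPTO_bswap8(UINT64_C(0x0102030405060708)) ==
             UINT64_C(0x0807060504030201));
      return 0;
    }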

Change-Id: I9f2602f8b9965c63a86b177a8a084afb8b53a253
Reviewed-on: https://boringssl-review.googlesource.com/12479
Commit-Queue: Adam Langley <alangley@gmail.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/modes/cbc.c b/crypto/modes/cbc.c
index e41f2b4..6e9fe24 100644
--- a/crypto/modes/cbc.c
+++ b/crypto/modes/cbc.c
@@ -52,10 +52,6 @@
 #include "internal.h"
 
 
-#ifndef STRICT_ALIGNMENT
-#  define STRICT_ALIGNMENT 0
-#endif
-
 void CRYPTO_cbc128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                            const void *key, uint8_t ivec[16],
                            block128_f block) {
diff --git a/crypto/modes/gcm.c b/crypto/modes/gcm.c
index c6b850a..d2e24c3 100644
--- a/crypto/modes/gcm.c
+++ b/crypto/modes/gcm.c
@@ -65,14 +65,6 @@
 #define GHASH_ASM
 #endif
 
-#if defined(BSWAP4) && STRICT_ALIGNMENT == 1
-/* redefine, because alignment is ensured */
-#undef GETU32
-#define GETU32(p) BSWAP4(*(const uint32_t *)(p))
-#undef PUTU32
-#define PUTU32(p, v) *(uint32_t *)(p) = BSWAP4(v)
-#endif
-
 #define PACK(s) ((size_t)(s) << (sizeof(size_t) * 8 - 16))
 #define REDUCE1BIT(V)                                                 \
   do {                                                                \
@@ -182,21 +174,8 @@
     Z.lo ^= Htable[nlo].lo;
   }
 
-#ifdef BSWAP8
-  Xi[0] = BSWAP8(Z.hi);
-  Xi[1] = BSWAP8(Z.lo);
-#else
-  uint8_t *p = (uint8_t *)Xi;
-  uint32_t v;
-  v = (uint32_t)(Z.hi >> 32);
-  PUTU32(p, v);
-  v = (uint32_t)(Z.hi);
-  PUTU32(p + 4, v);
-  v = (uint32_t)(Z.lo >> 32);
-  PUTU32(p + 8, v);
-  v = (uint32_t)(Z.lo);
-  PUTU32(p + 12, v);
-#endif
+  Xi[0] = CRYPTO_bswap8(Z.hi);
+  Xi[1] = CRYPTO_bswap8(Z.lo);
 }
 
 /* Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
@@ -255,21 +234,8 @@
       Z.lo ^= Htable[nlo].lo;
     }
 
-#ifdef BSWAP8
-    Xi[0] = BSWAP8(Z.hi);
-    Xi[1] = BSWAP8(Z.lo);
-#else
-    uint8_t *p = (uint8_t *)Xi;
-    uint32_t v;
-    v = (uint32_t)(Z.hi >> 32);
-    PUTU32(p, v);
-    v = (uint32_t)(Z.hi);
-    PUTU32(p + 4, v);
-    v = (uint32_t)(Z.lo >> 32);
-    PUTU32(p + 8, v);
-    v = (uint32_t)(Z.lo);
-    PUTU32(p + 12, v);
-#endif
+    Xi[0] = CRYPTO_bswap8(Z.hi);
+    Xi[1] = CRYPTO_bswap8(Z.lo);
   } while (inp += 16, len -= 16);
 }
 #else /* GHASH_ASM */
@@ -400,17 +366,8 @@
   memcpy(H.c, gcm_key, 16);
 
   /* H is stored in host byte order */
-#ifdef BSWAP8
-  H.u[0] = BSWAP8(H.u[0]);
-  H.u[1] = BSWAP8(H.u[1]);
-#else
-  uint8_t *p = H.c;
-  uint64_t hi, lo;
-  hi = (uint64_t)GETU32(p) << 32 | GETU32(p + 4);
-  lo = (uint64_t)GETU32(p + 8) << 32 | GETU32(p + 12);
-  H.u[0] = hi;
-  H.u[1] = lo;
-#endif
+  H.u[0] = CRYPTO_bswap8(H.u[0]);
+  H.u[1] = CRYPTO_bswap8(H.u[1]);
 
 #if defined(GHASH_ASM_X86_OR_64)
   if (crypto_gcm_clmul_enabled()) {
@@ -519,26 +476,15 @@
       GCM_MUL(ctx, Yi);
     }
     len0 <<= 3;
-#ifdef BSWAP8
-    ctx->Yi.u[1] ^= BSWAP8(len0);
-#else
-    ctx->Yi.c[8] ^= (uint8_t)(len0 >> 56);
-    ctx->Yi.c[9] ^= (uint8_t)(len0 >> 48);
-    ctx->Yi.c[10] ^= (uint8_t)(len0 >> 40);
-    ctx->Yi.c[11] ^= (uint8_t)(len0 >> 32);
-    ctx->Yi.c[12] ^= (uint8_t)(len0 >> 24);
-    ctx->Yi.c[13] ^= (uint8_t)(len0 >> 16);
-    ctx->Yi.c[14] ^= (uint8_t)(len0 >> 8);
-    ctx->Yi.c[15] ^= (uint8_t)(len0);
-#endif
+    ctx->Yi.u[1] ^= CRYPTO_bswap8(len0);
 
     GCM_MUL(ctx, Yi);
-    ctr = GETU32(ctx->Yi.c + 12);
+    ctr = GETU32_aligned(ctx->Yi.c + 12);
   }
 
   (*ctx->block)(ctx->Yi.c, ctx->EK0.c, key);
   ++ctr;
-  PUTU32(ctx->Yi.c + 12, ctr);
+  PUTU32_aligned(ctx->Yi.c + 12, ctr);
 }
 
 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const uint8_t *aad, size_t len) {
@@ -635,7 +581,7 @@
     ctx->ares = 0;
   }
 
-  ctr = GETU32(ctx->Yi.c + 12);
+  ctr = GETU32_aligned(ctx->Yi.c + 12);
 
   n = ctx->mres;
   if (n) {
@@ -656,7 +602,7 @@
       if (n == 0) {
         (*block)(ctx->Yi.c, ctx->EKi.c, key);
         ++ctr;
-        PUTU32(ctx->Yi.c + 12, ctr);
+        PUTU32_aligned(ctx->Yi.c + 12, ctr);
       }
       ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
       n = (n + 1) % 16;
@@ -678,7 +624,7 @@
 
       (*block)(ctx->Yi.c, ctx->EKi.c, key);
       ++ctr;
-      PUTU32(ctx->Yi.c + 12, ctr);
+      PUTU32_aligned(ctx->Yi.c + 12, ctr);
       for (size_t i = 0; i < 16 / sizeof(size_t); ++i) {
         out_t[i] = in_t[i] ^ ctx->EKi.t[i];
       }
@@ -697,7 +643,7 @@
 
       (*block)(ctx->Yi.c, ctx->EKi.c, key);
       ++ctr;
-      PUTU32(ctx->Yi.c + 12, ctr);
+      PUTU32_aligned(ctx->Yi.c + 12, ctr);
       for (size_t i = 0; i < 16 / sizeof(size_t); ++i) {
         out_t[i] = in_t[i] ^ ctx->EKi.t[i];
       }
@@ -714,7 +660,7 @@
 
     (*block)(ctx->Yi.c, ctx->EKi.c, key);
     ++ctr;
-    PUTU32(ctx->Yi.c + 12, ctr);
+    PUTU32_aligned(ctx->Yi.c + 12, ctr);
     for (size_t i = 0; i < 16 / sizeof(size_t); ++i) {
       ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
     }
@@ -727,7 +673,7 @@
   if (len) {
     (*block)(ctx->Yi.c, ctx->EKi.c, key);
     ++ctr;
-    PUTU32(ctx->Yi.c + 12, ctr);
+    PUTU32_aligned(ctx->Yi.c + 12, ctr);
     while (len--) {
       ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
       ++n;
@@ -765,7 +711,7 @@
     ctx->ares = 0;
   }
 
-  ctr = GETU32(ctx->Yi.c + 12);
+  ctr = GETU32_aligned(ctx->Yi.c + 12);
 
   n = ctx->mres;
   if (n) {
@@ -789,7 +735,7 @@
       if (n == 0) {
         (*block)(ctx->Yi.c, ctx->EKi.c, key);
         ++ctr;
-        PUTU32(ctx->Yi.c + 12, ctr);
+        PUTU32_aligned(ctx->Yi.c + 12, ctr);
       }
       c = in[i];
       out[i] = c ^ ctx->EKi.c[n];
@@ -814,7 +760,7 @@
 
       (*block)(ctx->Yi.c, ctx->EKi.c, key);
       ++ctr;
-      PUTU32(ctx->Yi.c + 12, ctr);
+      PUTU32_aligned(ctx->Yi.c + 12, ctr);
       for (size_t i = 0; i < 16 / sizeof(size_t); ++i) {
         out_t[i] = in_t[i] ^ ctx->EKi.t[i];
       }
@@ -833,7 +779,7 @@
 
       (*block)(ctx->Yi.c, ctx->EKi.c, key);
       ++ctr;
-      PUTU32(ctx->Yi.c + 12, ctr);
+      PUTU32_aligned(ctx->Yi.c + 12, ctr);
       for (size_t i = 0; i < 16 / sizeof(size_t); ++i) {
         out_t[i] = in_t[i] ^ ctx->EKi.t[i];
       }
@@ -849,7 +795,7 @@
 
     (*block)(ctx->Yi.c, ctx->EKi.c, key);
     ++ctr;
-    PUTU32(ctx->Yi.c + 12, ctr);
+    PUTU32_aligned(ctx->Yi.c + 12, ctr);
     for (size_t i = 0; i < 16 / sizeof(size_t); ++i) {
       size_t c = in_t[i];
       out_t[i] = c ^ ctx->EKi.t[i];
@@ -864,7 +810,7 @@
   if (len) {
     (*block)(ctx->Yi.c, ctx->EKi.c, key);
     ++ctr;
-    PUTU32(ctx->Yi.c + 12, ctr);
+    PUTU32_aligned(ctx->Yi.c + 12, ctr);
     while (len--) {
       uint8_t c = in[n];
       ctx->Xi.c[n] ^= c;
@@ -929,13 +875,13 @@
   }
 #endif
 
-  ctr = GETU32(ctx->Yi.c + 12);
+  ctr = GETU32_aligned(ctx->Yi.c + 12);
 
 #if defined(GHASH)
   while (len >= GHASH_CHUNK) {
     (*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
     ctr += GHASH_CHUNK / 16;
-    PUTU32(ctx->Yi.c + 12, ctr);
+    PUTU32_aligned(ctx->Yi.c + 12, ctr);
     GHASH(ctx, out, GHASH_CHUNK);
     out += GHASH_CHUNK;
     in += GHASH_CHUNK;
@@ -948,7 +894,7 @@
 
     (*stream)(in, out, j, key, ctx->Yi.c);
     ctr += (unsigned int)j;
-    PUTU32(ctx->Yi.c + 12, ctr);
+    PUTU32_aligned(ctx->Yi.c + 12, ctr);
     in += i;
     len -= i;
 #if defined(GHASH)
@@ -967,7 +913,7 @@
   if (len) {
     (*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
     ++ctr;
-    PUTU32(ctx->Yi.c + 12, ctr);
+    PUTU32_aligned(ctx->Yi.c + 12, ctr);
     while (len--) {
       ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
       ++n;
@@ -1032,14 +978,14 @@
   }
 #endif
 
-  ctr = GETU32(ctx->Yi.c + 12);
+  ctr = GETU32_aligned(ctx->Yi.c + 12);
 
 #if defined(GHASH)
   while (len >= GHASH_CHUNK) {
     GHASH(ctx, in, GHASH_CHUNK);
     (*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
     ctr += GHASH_CHUNK / 16;
-    PUTU32(ctx->Yi.c + 12, ctr);
+    PUTU32_aligned(ctx->Yi.c + 12, ctr);
     out += GHASH_CHUNK;
     in += GHASH_CHUNK;
     len -= GHASH_CHUNK;
@@ -1065,7 +1011,7 @@
 #endif
     (*stream)(in, out, j, key, ctx->Yi.c);
     ctr += (unsigned int)j;
-    PUTU32(ctx->Yi.c + 12, ctr);
+    PUTU32_aligned(ctx->Yi.c + 12, ctr);
     out += i;
     in += i;
     len -= i;
@@ -1073,7 +1019,7 @@
   if (len) {
     (*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
     ++ctr;
-    PUTU32(ctx->Yi.c + 12, ctr);
+    PUTU32_aligned(ctx->Yi.c + 12, ctr);
     while (len--) {
       uint8_t c = in[n];
       ctx->Xi.c[n] ^= c;
@@ -1097,18 +1043,8 @@
     GCM_MUL(ctx, Xi);
   }
 
-#ifdef BSWAP8
-  alen = BSWAP8(alen);
-  clen = BSWAP8(clen);
-#else
-  uint8_t *p = ctx->len.c;
-
-  ctx->len.u[0] = alen;
-  ctx->len.u[1] = clen;
-
-  alen = (uint64_t)GETU32(p) << 32 | GETU32(p + 4);
-  clen = (uint64_t)GETU32(p + 8) << 32 | GETU32(p + 12);
-#endif
+  alen = CRYPTO_bswap8(alen);
+  clen = CRYPTO_bswap8(clen);
 
   ctx->Xi.u[0] ^= alen;
   ctx->Xi.u[1] ^= clen;
diff --git a/crypto/modes/gcm_test.cc b/crypto/modes/gcm_test.cc
index 51d966e..8fee4e4 100644
--- a/crypto/modes/gcm_test.cc
+++ b/crypto/modes/gcm_test.cc
@@ -388,12 +388,22 @@
   return ret;
 }
 
+static bool TestByteSwap() {
+  return CRYPTO_bswap4(0x01020304) == 0x04030201 &&
+         CRYPTO_bswap8(UINT64_C(0x0102030405060708)) ==
+             UINT64_C(0x0807060504030201);
+}
+
 int main(void) {
   int ret = 0;
   unsigned i;
 
   CRYPTO_library_init();
 
+  if (!TestByteSwap()) {
+    ret = 1;
+  }
+
   for (i = 0; i < sizeof(test_cases) / sizeof(struct test_case); i++) {
     if (!run_test_case(i, &test_cases[i])) {
       ret = 1;
diff --git a/crypto/modes/internal.h b/crypto/modes/internal.h
index 94fd3f5..64232c4 100644
--- a/crypto/modes/internal.h
+++ b/crypto/modes/internal.h
@@ -51,6 +51,8 @@
 
 #include <openssl/base.h>
 
+#include <string.h>
+
 #if defined(__cplusplus)
 extern "C" {
 #endif
@@ -64,90 +66,62 @@
 #define STRICT_ALIGNMENT 0
 #endif
 
-#if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM)
 #if defined(__GNUC__) && __GNUC__ >= 2
-#if defined(OPENSSL_X86_64)
-#define BSWAP8(x)                 \
-  ({                              \
-    uint64_t ret = (x);           \
-    asm("bswapq %0" : "+r"(ret)); \
-    ret;                          \
-  })
-#define BSWAP4(x)                 \
-  ({                              \
-    uint32_t ret = (x);           \
-    asm("bswapl %0" : "+r"(ret)); \
-    ret;                          \
-  })
-#elif defined(OPENSSL_X86)
-#define BSWAP8(x)                                     \
-  ({                                                  \
-    uint32_t lo = (uint64_t)(x) >> 32, hi = (x);      \
-    asm("bswapl %0; bswapl %1" : "+r"(hi), "+r"(lo)); \
-    (uint64_t) hi << 32 | lo;                         \
-  })
-#define BSWAP4(x)                 \
-  ({                              \
-    uint32_t ret = (x);           \
-    asm("bswapl %0" : "+r"(ret)); \
-    ret;                          \
-  })
-#elif defined(OPENSSL_AARCH64)
-#define BSWAP8(x)                          \
-  ({                                       \
-    uint64_t ret;                          \
-    asm("rev %0,%1" : "=r"(ret) : "r"(x)); \
-    ret;                                   \
-  })
-#define BSWAP4(x)                            \
-  ({                                         \
-    uint32_t ret;                            \
-    asm("rev %w0,%w1" : "=r"(ret) : "r"(x)); \
-    ret;                                     \
-  })
-#elif defined(OPENSSL_ARM) && !defined(STRICT_ALIGNMENT)
-#define BSWAP8(x)                                     \
-  ({                                                  \
-    uint32_t lo = (uint64_t)(x) >> 32, hi = (x);      \
-    asm("rev %0,%0; rev %1,%1" : "+r"(hi), "+r"(lo)); \
-    (uint64_t) hi << 32 | lo;                         \
-  })
-#define BSWAP4(x)                                      \
-  ({                                                   \
-    uint32_t ret;                                      \
-    asm("rev %0,%1" : "=r"(ret) : "r"((uint32_t)(x))); \
-    ret;                                               \
-  })
-#endif
+static inline uint32_t CRYPTO_bswap4(uint32_t x) {
+  return __builtin_bswap32(x);
+}
+
+static inline uint64_t CRYPTO_bswap8(uint64_t x) {
+  return __builtin_bswap64(x);
+}
 #elif defined(_MSC_VER)
-#if _MSC_VER >= 1300
 OPENSSL_MSVC_PRAGMA(warning(push, 3))
 #include <intrin.h>
 OPENSSL_MSVC_PRAGMA(warning(pop))
 #pragma intrinsic(_byteswap_uint64, _byteswap_ulong)
-#define BSWAP8(x) _byteswap_uint64((uint64_t)(x))
-#define BSWAP4(x) _byteswap_ulong((uint32_t)(x))
-#elif defined(OPENSSL_X86)
-__inline uint32_t _bswap4(uint32_t val) {
-  _asm mov eax, val
-  _asm bswap eax
+static inline uint32_t CRYPTO_bswap4(uint32_t x) {
+  return _byteswap_ulong(x);
 }
-#define BSWAP4(x) _bswap4(x)
-#endif
-#endif
-#endif
 
-#if defined(BSWAP4) && !defined(STRICT_ALIGNMENT)
-#define GETU32(p) BSWAP4(*(const uint32_t *)(p))
-#define PUTU32(p, v) *(uint32_t *)(p) = BSWAP4(v)
+static inline uint64_t CRYPTO_bswap8(uint64_t x) {
+  return _byteswap_uint64(x);
+}
 #else
-#define GETU32(p) \
-  ((uint32_t)(p)[0] << 24 | (uint32_t)(p)[1] << 16 | (uint32_t)(p)[2] << 8 | (uint32_t)(p)[3])
-#define PUTU32(p, v)                                   \
-  ((p)[0] = (uint8_t)((v) >> 24), (p)[1] = (uint8_t)((v) >> 16), \
-   (p)[2] = (uint8_t)((v) >> 8), (p)[3] = (uint8_t)(v))
+static inline uint32_t CRYPTO_bswap4(uint32_t x) {
+  x = (x >> 16) | (x << 16);
+  x = ((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8);
+  return x;
+}
+
+static inline uint64_t CRYPTO_bswap8(uint64_t x) {
+  return CRYPTO_bswap4(x >> 32) | (((uint64_t)CRYPTO_bswap4(x)) << 32);
+}
 #endif
 
+/* GETU32 and PUTU32 read and write 32-bit big-endian values at any
+ * alignment, using |memcpy| to avoid strict-aliasing problems. */
+static inline uint32_t GETU32(const void *in) {
+  uint32_t v;
+  memcpy(&v, in, sizeof(v));
+  return CRYPTO_bswap4(v);
+}
+
+static inline void PUTU32(void *out, uint32_t v) {
+  v = CRYPTO_bswap4(v);
+  memcpy(out, &v, sizeof(v));
+}
+
+/* The _aligned variants dereference the pointer directly and thus require
+ * a 32-bit-aligned address. */
+static inline uint32_t GETU32_aligned(const void *in) {
+  const char *alias = (const char *)in;
+  return CRYPTO_bswap4(*((const uint32_t *)alias));
+}
+
+static inline void PUTU32_aligned(void *out, uint32_t v) {
+  char *alias = (char *)out;
+  *((uint32_t *)alias) = CRYPTO_bswap4(v);
+}
 
 /* block128_f is the type of a 128-bit, block cipher. */
 typedef void (*block128_f)(const uint8_t in[16], uint8_t out[16],