Remove unions in GCM implementation

This was a bit of a mess. There are three assembly functions to juggle
here. Their current type signatures are:

 void gcm_init_v8(u128 Htable[16], const uint64_t H[2]);
 void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]);
 void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                   size_t len);

Except for gcm_nohw.c, these are all assembly functions, so they don't
follow the C abstract machine's theory of typed memory. That means the
types are mostly arbitrary and we have room to rearrange them. They do
carry an implicit alignment requirement, but none of these assembly
files care about it[*].

Values passed to gcm_gmult and gcm_ghash get XORed byte-by-byte in
places, which is inconvenient to do as uint64_t. They also get passed to
AES functions, which want bytes. Thus I think uint8_t[16] is the most
natural and convenient type to use.
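
For reference, the resulting signatures (matching the internal.h changes
below) are:

 void gcm_init_v8(u128 Htable[16], const uint64_t H[2]);
 void gcm_gmult_v8(uint8_t Xi[16], const u128 Htable[16]);
 void gcm_ghash_v8(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                   size_t len);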

H in gcm_init is interesting. gcm_init already doesn't take the GHASH
key in the natural byte representation. Each 8-byte half is
byte-swapped, but the two halves themselves are not exchanged, so it's
not quite a full byte reversal. I opted to leave that argument as
uint64_t[2], mostly to capture that something odd is happening here.
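
Concretely, the setup code in gcm.c (see below) derives H from the
serialized key with two big-endian 64-bit loads, without exchanging the
halves:

 uint64_t H[2] = {CRYPTO_load_u64_be(gcm_key),
                  CRYPTO_load_u64_be(gcm_key + 8)};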

[*] We only have GHASH assembly for x86, x86_64, armv7, and aarch64. We
used to have armv4 GHASH assembly, but that has since been removed, so
armv4 falls back to gcm_nohw.c. Thus we can assume none of these files
care about alignment for plain scalar loads. Alignment does matter for
vmovdqa vs. vmovdqu, but that requires 16-byte alignment, and uint64_t
only implies 4- or 8-byte alignment on these architectures.
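
The 16-byte alignment requirement that does exist is on |Htable|, for
the |gcm_*_ssse3| functions, and it is expressed with an explicit
alignas on the struct field rather than implied by the argument types:

 // |gcm_*_ssse3| require |Htable| to be 16-byte-aligned.
 // TODO(crbug.com/boringssl/604): Revisit this.
 alignas(16) u128 Htable[16];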

Bug: 574
Change-Id: If7dba9b41ff62204f4cf8fcd54eb4a4c54214c6e
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/59528
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/modes/cbc.c b/crypto/fipsmodule/modes/cbc.c
index df8f9ce..511a5de 100644
--- a/crypto/fipsmodule/modes/cbc.c
+++ b/crypto/fipsmodule/modes/cbc.c
@@ -66,10 +66,7 @@
   size_t n;
   const uint8_t *iv = ivec;
   while (len >= 16) {
-    for (n = 0; n < 16; n += sizeof(crypto_word_t)) {
-      CRYPTO_store_word_le(
-          out + n, CRYPTO_load_word_le(in + n) ^ CRYPTO_load_word_le(iv + n));
-    }
+    CRYPTO_xor16(out, in, iv);
     (*block)(out, out, key);
     iv = out;
     len -= 16;
@@ -118,15 +115,10 @@
   if ((inptr >= 32 && outptr <= inptr - 32) || inptr < outptr) {
     // If |out| is at least two blocks behind |in| or completely disjoint, there
     // is no need to decrypt to a temporary block.
-    static_assert(16 % sizeof(crypto_word_t) == 0,
-                  "block cannot be evenly divided into words");
     const uint8_t *iv = ivec;
     while (len >= 16) {
       (*block)(in, out, key);
-      for (n = 0; n < 16; n += sizeof(crypto_word_t)) {
-        CRYPTO_store_word_le(out + n, CRYPTO_load_word_le(out + n) ^
-                                          CRYPTO_load_word_le(iv + n));
-      }
+      CRYPTO_xor16(out, out, iv);
       iv = in;
       len -= 16;
       in += 16;
diff --git a/crypto/fipsmodule/modes/ctr.c b/crypto/fipsmodule/modes/ctr.c
index 1688f82..8c333bb 100644
--- a/crypto/fipsmodule/modes/ctr.c
+++ b/crypto/fipsmodule/modes/ctr.c
@@ -101,10 +101,7 @@
   while (len >= 16) {
     (*block)(ivec, ecount_buf, key);
     ctr128_inc(ivec);
-    for (n = 0; n < 16; n += sizeof(crypto_word_t)) {
-      CRYPTO_store_word_le(out + n, CRYPTO_load_word_le(in + n) ^
-                                        CRYPTO_load_word_le(ecount_buf + n));
-    }
+    CRYPTO_xor16(out, in, ecount_buf);
     len -= 16;
     out += 16;
     in += 16;
diff --git a/crypto/fipsmodule/modes/gcm.c b/crypto/fipsmodule/modes/gcm.c
index 40ec0c8..8413951 100644
--- a/crypto/fipsmodule/modes/gcm.c
+++ b/crypto/fipsmodule/modes/gcm.c
@@ -62,9 +62,9 @@
 static const size_t kSizeTWithoutLower4Bits = (size_t) -16;
 
 
-#define GCM_MUL(ctx, Xi) gcm_gmult_nohw((ctx)->Xi.u, (ctx)->gcm_key.Htable)
+#define GCM_MUL(ctx, Xi) gcm_gmult_nohw((ctx)->Xi, (ctx)->gcm_key.Htable)
 #define GHASH(ctx, in, len) \
-  gcm_ghash_nohw((ctx)->Xi.u, (ctx)->gcm_key.Htable, in, len)
+  gcm_ghash_nohw((ctx)->Xi, (ctx)->gcm_key.Htable, in, len)
 // GHASH_CHUNK is "stride parameter" missioned to mitigate cache
 // trashing effect. In other words idea is to hash data while it's
 // still in L1 cache after encryption pass...
@@ -126,22 +126,22 @@
 
 #ifdef GCM_FUNCREF
 #undef GCM_MUL
-#define GCM_MUL(ctx, Xi) (*gcm_gmult_p)((ctx)->Xi.u, (ctx)->gcm_key.Htable)
+#define GCM_MUL(ctx, Xi) (*gcm_gmult_p)((ctx)->Xi, (ctx)->gcm_key.Htable)
 #undef GHASH
 #define GHASH(ctx, in, len) \
-  (*gcm_ghash_p)((ctx)->Xi.u, (ctx)->gcm_key.Htable, in, len)
+  (*gcm_ghash_p)((ctx)->Xi, (ctx)->gcm_key.Htable, in, len)
 #endif  // GCM_FUNCREF
 
 #if defined(HW_GCM) && defined(OPENSSL_X86_64)
 static size_t hw_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
-                             const AES_KEY *key, uint8_t ivec[16], uint64_t *Xi,
-                             const u128 Htable[16]) {
+                             const AES_KEY *key, uint8_t ivec[16],
+                             uint8_t Xi[16], const u128 Htable[16]) {
   return aesni_gcm_encrypt(in, out, len, key, ivec, Htable, Xi);
 }
 
 static size_t hw_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len,
-                             const AES_KEY *key, uint8_t ivec[16], uint64_t *Xi,
-                             const u128 Htable[16]) {
+                             const AES_KEY *key, uint8_t ivec[16],
+                             uint8_t Xi[16], const u128 Htable[16]) {
   return aesni_gcm_decrypt(in, out, len, key, ivec, Htable, Xi);
 }
 #endif  // HW_GCM && X86_64
@@ -149,8 +149,8 @@
 #if defined(HW_GCM) && defined(OPENSSL_AARCH64)
 
 static size_t hw_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
-                             const AES_KEY *key, uint8_t ivec[16], uint64_t *Xi,
-                             const u128 Htable[16]) {
+                             const AES_KEY *key, uint8_t ivec[16],
+                             uint8_t Xi[16], const u128 Htable[16]) {
   const size_t len_blocks = len & kSizeTWithoutLower4Bits;
   if (!len_blocks) {
     return 0;
@@ -160,8 +160,8 @@
 }
 
 static size_t hw_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len,
-                             const AES_KEY *key, uint8_t ivec[16], uint64_t *Xi,
-                             const u128 Htable[16]) {
+                             const AES_KEY *key, uint8_t ivec[16],
+                             uint8_t Xi[16], const u128 Htable[16]) {
   const size_t len_blocks = len & kSizeTWithoutLower4Bits;
   if (!len_blocks) {
     return 0;
@@ -177,7 +177,7 @@
                        const uint8_t gcm_key[16]) {
   *out_is_avx = 0;
 
-  // H is stored in host byte order.
+  // H is passed to |gcm_init_*| as a pair of byte-swapped, 64-bit values.
   uint64_t H[2] = {CRYPTO_load_u64_be(gcm_key),
                    CRYPTO_load_u64_be(gcm_key + 8)};
 
@@ -258,75 +258,75 @@
 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const AES_KEY *key,
                          const uint8_t *iv, size_t len) {
 #ifdef GCM_FUNCREF
-  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) =
+  void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) =
       ctx->gcm_key.gmult;
 #endif
 
-  ctx->Yi.u[0] = 0;
-  ctx->Yi.u[1] = 0;
-  ctx->Xi.u[0] = 0;
-  ctx->Xi.u[1] = 0;
-  ctx->len.u[0] = 0;  // AAD length
-  ctx->len.u[1] = 0;  // message length
+  OPENSSL_memset(&ctx->Yi, 0, sizeof(ctx->Yi));
+  OPENSSL_memset(&ctx->Xi, 0, sizeof(ctx->Xi));
+  ctx->len.aad = 0;
+  ctx->len.msg = 0;
   ctx->ares = 0;
   ctx->mres = 0;
 
   uint32_t ctr;
   if (len == 12) {
-    OPENSSL_memcpy(ctx->Yi.c, iv, 12);
-    ctx->Yi.c[15] = 1;
+    OPENSSL_memcpy(ctx->Yi, iv, 12);
+    ctx->Yi[15] = 1;
     ctr = 1;
   } else {
     uint64_t len0 = len;
 
     while (len >= 16) {
-      for (size_t i = 0; i < 16; ++i) {
-        ctx->Yi.c[i] ^= iv[i];
-      }
+      CRYPTO_xor16(ctx->Yi, ctx->Yi, iv);
       GCM_MUL(ctx, Yi);
       iv += 16;
       len -= 16;
     }
     if (len) {
       for (size_t i = 0; i < len; ++i) {
-        ctx->Yi.c[i] ^= iv[i];
+        ctx->Yi[i] ^= iv[i];
       }
       GCM_MUL(ctx, Yi);
     }
-    len0 <<= 3;
-    ctx->Yi.u[1] ^= CRYPTO_bswap8(len0);
+
+    uint8_t len_block[16];
+    OPENSSL_memset(len_block, 0, 8);
+    CRYPTO_store_u64_be(len_block + 8, len0 << 3);
+    CRYPTO_xor16(ctx->Yi, ctx->Yi, len_block);
 
     GCM_MUL(ctx, Yi);
-    ctr = CRYPTO_bswap4(ctx->Yi.d[3]);
+    ctr = CRYPTO_load_u32_be(ctx->Yi + 12);
   }
 
-  (*ctx->gcm_key.block)(ctx->Yi.c, ctx->EK0.c, key);
+  (*ctx->gcm_key.block)(ctx->Yi, ctx->EK0, key);
   ++ctr;
-  ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
+  CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
 }
 
 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const uint8_t *aad, size_t len) {
 #ifdef GCM_FUNCREF
-  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) =
+  void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) =
       ctx->gcm_key.gmult;
-  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+  void (*gcm_ghash_p)(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                       size_t len) = ctx->gcm_key.ghash;
 #endif
 
-  if (ctx->len.u[1]) {
+  if (ctx->len.msg != 0) {
+    // The caller must have finished the AAD before providing other input.
     return 0;
   }
 
-  uint64_t alen = ctx->len.u[0] + len;
+  uint64_t alen = ctx->len.aad + len;
   if (alen > (UINT64_C(1) << 61) || (sizeof(len) == 8 && alen < len)) {
     return 0;
   }
-  ctx->len.u[0] = alen;
+  ctx->len.aad = alen;
 
   unsigned n = ctx->ares;
   if (n) {
     while (n && len) {
-      ctx->Xi.c[n] ^= *(aad++);
+      ctx->Xi[n] ^= *(aad++);
       --len;
       n = (n + 1) % 16;
     }
@@ -350,7 +350,7 @@
   if (len != 0) {
     n = (unsigned int)len;
     for (size_t i = 0; i < len; ++i) {
-      ctx->Xi.c[i] ^= aad[i];
+      ctx->Xi[i] ^= aad[i];
     }
   }
 
@@ -362,18 +362,18 @@
                           const uint8_t *in, uint8_t *out, size_t len) {
   block128_f block = ctx->gcm_key.block;
 #ifdef GCM_FUNCREF
-  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) =
+  void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) =
       ctx->gcm_key.gmult;
-  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+  void (*gcm_ghash_p)(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                       size_t len) = ctx->gcm_key.ghash;
 #endif
 
-  uint64_t mlen = ctx->len.u[1] + len;
+  uint64_t mlen = ctx->len.msg + len;
   if (mlen > ((UINT64_C(1) << 36) - 32) ||
       (sizeof(len) == 8 && mlen < len)) {
     return 0;
   }
-  ctx->len.u[1] = mlen;
+  ctx->len.msg = mlen;
 
   if (ctx->ares) {
     // First call to encrypt finalizes GHASH(AAD)
@@ -384,7 +384,7 @@
   unsigned n = ctx->mres;
   if (n) {
     while (n && len) {
-      ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
+      ctx->Xi[n] ^= *(out++) = *(in++) ^ ctx->EKi[n];
       --len;
       n = (n + 1) % 16;
     }
@@ -396,19 +396,15 @@
     }
   }
 
-  uint32_t ctr = CRYPTO_bswap4(ctx->Yi.d[3]);
+  uint32_t ctr = CRYPTO_load_u32_be(ctx->Yi + 12);
   while (len >= GHASH_CHUNK) {
     size_t j = GHASH_CHUNK;
 
     while (j) {
-      (*block)(ctx->Yi.c, ctx->EKi.c, key);
+      (*block)(ctx->Yi, ctx->EKi, key);
       ++ctr;
-      ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
-      for (size_t i = 0; i < 16; i += sizeof(crypto_word_t)) {
-        CRYPTO_store_word_le(out + i,
-                             CRYPTO_load_word_le(in + i) ^
-                                 ctx->EKi.t[i / sizeof(crypto_word_t)]);
-      }
+      CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
+      CRYPTO_xor16(out, in, ctx->EKi);
       out += 16;
       in += 16;
       j -= 16;
@@ -419,14 +415,10 @@
   size_t len_blocks = len & kSizeTWithoutLower4Bits;
   if (len_blocks != 0) {
     while (len >= 16) {
-      (*block)(ctx->Yi.c, ctx->EKi.c, key);
+      (*block)(ctx->Yi, ctx->EKi, key);
       ++ctr;
-      ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
-      for (size_t i = 0; i < 16; i += sizeof(crypto_word_t)) {
-        CRYPTO_store_word_le(out + i,
-                             CRYPTO_load_word_le(in + i) ^
-                                 ctx->EKi.t[i / sizeof(crypto_word_t)]);
-      }
+      CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
+      CRYPTO_xor16(out, in, ctx->EKi);
       out += 16;
       in += 16;
       len -= 16;
@@ -434,11 +426,11 @@
     GHASH(ctx, out - len_blocks, len_blocks);
   }
   if (len) {
-    (*block)(ctx->Yi.c, ctx->EKi.c, key);
+    (*block)(ctx->Yi, ctx->EKi, key);
     ++ctr;
-    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
+    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
     while (len--) {
-      ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
+      ctx->Xi[n] ^= out[n] = in[n] ^ ctx->EKi[n];
       ++n;
     }
   }
@@ -452,18 +444,18 @@
                           size_t len) {
   block128_f block = ctx->gcm_key.block;
 #ifdef GCM_FUNCREF
-  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) =
+  void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) =
       ctx->gcm_key.gmult;
-  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+  void (*gcm_ghash_p)(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                       size_t len) = ctx->gcm_key.ghash;
 #endif
 
-  uint64_t mlen = ctx->len.u[1] + len;
+  uint64_t mlen = ctx->len.msg + len;
   if (mlen > ((UINT64_C(1) << 36) - 32) ||
       (sizeof(len) == 8 && mlen < len)) {
     return 0;
   }
-  ctx->len.u[1] = mlen;
+  ctx->len.msg = mlen;
 
   if (ctx->ares) {
     // First call to decrypt finalizes GHASH(AAD)
@@ -475,8 +467,8 @@
   if (n) {
     while (n && len) {
       uint8_t c = *(in++);
-      *(out++) = c ^ ctx->EKi.c[n];
-      ctx->Xi.c[n] ^= c;
+      *(out++) = c ^ ctx->EKi[n];
+      ctx->Xi[n] ^= c;
       --len;
       n = (n + 1) % 16;
     }
@@ -488,20 +480,16 @@
     }
   }
 
-  uint32_t ctr = CRYPTO_bswap4(ctx->Yi.d[3]);
+  uint32_t ctr = CRYPTO_load_u32_be(ctx->Yi + 12);
   while (len >= GHASH_CHUNK) {
     size_t j = GHASH_CHUNK;
 
     GHASH(ctx, in, GHASH_CHUNK);
     while (j) {
-      (*block)(ctx->Yi.c, ctx->EKi.c, key);
+      (*block)(ctx->Yi, ctx->EKi, key);
       ++ctr;
-      ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
-      for (size_t i = 0; i < 16; i += sizeof(crypto_word_t)) {
-        CRYPTO_store_word_le(out + i,
-                             CRYPTO_load_word_le(in + i) ^
-                                 ctx->EKi.t[i / sizeof(crypto_word_t)]);
-      }
+      CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
+      CRYPTO_xor16(out, in, ctx->EKi);
       out += 16;
       in += 16;
       j -= 16;
@@ -512,27 +500,23 @@
   if (len_blocks != 0) {
     GHASH(ctx, in, len_blocks);
     while (len >= 16) {
-      (*block)(ctx->Yi.c, ctx->EKi.c, key);
+      (*block)(ctx->Yi, ctx->EKi, key);
       ++ctr;
-      ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
-      for (size_t i = 0; i < 16; i += sizeof(crypto_word_t)) {
-        CRYPTO_store_word_le(out + i,
-                             CRYPTO_load_word_le(in + i) ^
-                                 ctx->EKi.t[i / sizeof(crypto_word_t)]);
-      }
+      CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
+      CRYPTO_xor16(out, in, ctx->EKi);
       out += 16;
       in += 16;
       len -= 16;
     }
   }
   if (len) {
-    (*block)(ctx->Yi.c, ctx->EKi.c, key);
+    (*block)(ctx->Yi, ctx->EKi, key);
     ++ctr;
-    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
+    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
     while (len--) {
       uint8_t c = in[n];
-      ctx->Xi.c[n] ^= c;
-      out[n] = c ^ ctx->EKi.c[n];
+      ctx->Xi[n] ^= c;
+      out[n] = c ^ ctx->EKi[n];
       ++n;
     }
   }
@@ -545,18 +529,18 @@
                                 const uint8_t *in, uint8_t *out, size_t len,
                                 ctr128_f stream) {
 #ifdef GCM_FUNCREF
-  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) =
+  void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) =
       ctx->gcm_key.gmult;
-  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+  void (*gcm_ghash_p)(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                       size_t len) = ctx->gcm_key.ghash;
 #endif
 
-  uint64_t mlen = ctx->len.u[1] + len;
+  uint64_t mlen = ctx->len.msg + len;
   if (mlen > ((UINT64_C(1) << 36) - 32) ||
       (sizeof(len) == 8 && mlen < len)) {
     return 0;
   }
-  ctx->len.u[1] = mlen;
+  ctx->len.msg = mlen;
 
   if (ctx->ares) {
     // First call to encrypt finalizes GHASH(AAD)
@@ -567,7 +551,7 @@
   unsigned n = ctx->mres;
   if (n) {
     while (n && len) {
-      ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
+      ctx->Xi[n] ^= *(out++) = *(in++) ^ ctx->EKi[n];
       --len;
       n = (n + 1) % 16;
     }
@@ -584,7 +568,7 @@
   if (ctx->gcm_key.use_hw_gcm_crypt && len > 0) {
     // |hw_gcm_encrypt| may not process all the input given to it. It may
     // not process *any* of its input if it is deemed too small.
-    size_t bulk = hw_gcm_encrypt(in, out, len, key, ctx->Yi.c, ctx->Xi.u,
+    size_t bulk = hw_gcm_encrypt(in, out, len, key, ctx->Yi, ctx->Xi,
                                  ctx->gcm_key.Htable);
     in += bulk;
     out += bulk;
@@ -592,11 +576,11 @@
   }
 #endif
 
-  uint32_t ctr = CRYPTO_bswap4(ctx->Yi.d[3]);
+  uint32_t ctr = CRYPTO_load_u32_be(ctx->Yi + 12);
   while (len >= GHASH_CHUNK) {
-    (*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
+    (*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi);
     ctr += GHASH_CHUNK / 16;
-    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
+    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
     GHASH(ctx, out, GHASH_CHUNK);
     out += GHASH_CHUNK;
     in += GHASH_CHUNK;
@@ -606,20 +590,20 @@
   if (len_blocks != 0) {
     size_t j = len_blocks / 16;
 
-    (*stream)(in, out, j, key, ctx->Yi.c);
+    (*stream)(in, out, j, key, ctx->Yi);
     ctr += (unsigned int)j;
-    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
+    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
     in += len_blocks;
     len -= len_blocks;
     GHASH(ctx, out, len_blocks);
     out += len_blocks;
   }
   if (len) {
-    (*ctx->gcm_key.block)(ctx->Yi.c, ctx->EKi.c, key);
+    (*ctx->gcm_key.block)(ctx->Yi, ctx->EKi, key);
     ++ctr;
-    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
+    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
     while (len--) {
-      ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
+      ctx->Xi[n] ^= out[n] = in[n] ^ ctx->EKi[n];
       ++n;
     }
   }
@@ -632,18 +616,18 @@
                                 const uint8_t *in, uint8_t *out, size_t len,
                                 ctr128_f stream) {
 #ifdef GCM_FUNCREF
-  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) =
+  void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) =
       ctx->gcm_key.gmult;
-  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+  void (*gcm_ghash_p)(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                       size_t len) = ctx->gcm_key.ghash;
 #endif
 
-  uint64_t mlen = ctx->len.u[1] + len;
+  uint64_t mlen = ctx->len.msg + len;
   if (mlen > ((UINT64_C(1) << 36) - 32) ||
       (sizeof(len) == 8 && mlen < len)) {
     return 0;
   }
-  ctx->len.u[1] = mlen;
+  ctx->len.msg = mlen;
 
   if (ctx->ares) {
     // First call to decrypt finalizes GHASH(AAD)
@@ -655,8 +639,8 @@
   if (n) {
     while (n && len) {
       uint8_t c = *(in++);
-      *(out++) = c ^ ctx->EKi.c[n];
-      ctx->Xi.c[n] ^= c;
+      *(out++) = c ^ ctx->EKi[n];
+      ctx->Xi[n] ^= c;
       --len;
       n = (n + 1) % 16;
     }
@@ -673,7 +657,7 @@
   if (ctx->gcm_key.use_hw_gcm_crypt && len > 0) {
     // |hw_gcm_decrypt| may not process all the input given to it. It may
     // not process *any* of its input if it is deemed too small.
-    size_t bulk = hw_gcm_decrypt(in, out, len, key, ctx->Yi.c, ctx->Xi.u,
+    size_t bulk = hw_gcm_decrypt(in, out, len, key, ctx->Yi, ctx->Xi,
                                  ctx->gcm_key.Htable);
     in += bulk;
     out += bulk;
@@ -681,12 +665,12 @@
   }
 #endif
 
-  uint32_t ctr = CRYPTO_bswap4(ctx->Yi.d[3]);
+  uint32_t ctr = CRYPTO_load_u32_be(ctx->Yi + 12);
   while (len >= GHASH_CHUNK) {
     GHASH(ctx, in, GHASH_CHUNK);
-    (*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
+    (*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi);
     ctr += GHASH_CHUNK / 16;
-    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
+    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
     out += GHASH_CHUNK;
     in += GHASH_CHUNK;
     len -= GHASH_CHUNK;
@@ -696,21 +680,21 @@
     size_t j = len_blocks / 16;
 
     GHASH(ctx, in, len_blocks);
-    (*stream)(in, out, j, key, ctx->Yi.c);
+    (*stream)(in, out, j, key, ctx->Yi);
     ctr += (unsigned int)j;
-    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
+    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
     out += len_blocks;
     in += len_blocks;
     len -= len_blocks;
   }
   if (len) {
-    (*ctx->gcm_key.block)(ctx->Yi.c, ctx->EKi.c, key);
+    (*ctx->gcm_key.block)(ctx->Yi, ctx->EKi, key);
     ++ctr;
-    ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
+    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
     while (len--) {
       uint8_t c = in[n];
-      ctx->Xi.c[n] ^= c;
-      out[n] = c ^ ctx->EKi.c[n];
+      ctx->Xi[n] ^= c;
+      out[n] = c ^ ctx->EKi[n];
       ++n;
     }
   }
@@ -721,7 +705,7 @@
 
 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const uint8_t *tag, size_t len) {
 #ifdef GCM_FUNCREF
-  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) =
+  void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) =
       ctx->gcm_key.gmult;
 #endif
 
@@ -729,15 +713,15 @@
     GCM_MUL(ctx, Xi);
   }
 
-  ctx->Xi.u[0] ^= CRYPTO_bswap8(ctx->len.u[0] << 3);
-  ctx->Xi.u[1] ^= CRYPTO_bswap8(ctx->len.u[1] << 3);
+  uint8_t len_block[16];
+  CRYPTO_store_u64_be(len_block, ctx->len.aad << 3);
+  CRYPTO_store_u64_be(len_block + 8, ctx->len.msg << 3);
+  CRYPTO_xor16(ctx->Xi, ctx->Xi, len_block);
   GCM_MUL(ctx, Xi);
-
-  ctx->Xi.u[0] ^= ctx->EK0.u[0];
-  ctx->Xi.u[1] ^= ctx->EK0.u[1];
+  CRYPTO_xor16(ctx->Xi, ctx->Xi, ctx->EK0);
 
   if (tag && len <= sizeof(ctx->Xi)) {
-    return CRYPTO_memcmp(ctx->Xi.c, tag, len) == 0;
+    return CRYPTO_memcmp(ctx->Xi, tag, len) == 0;
   } else {
     return 0;
   }
@@ -745,8 +729,7 @@
 
 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len) {
   CRYPTO_gcm128_finish(ctx, NULL, 0);
-  OPENSSL_memcpy(tag, ctx->Xi.c,
-                 len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
+  OPENSSL_memcpy(tag, ctx->Xi, len <= sizeof(ctx->Xi) ? len : sizeof(ctx->Xi));
 }
 
 #if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
diff --git a/crypto/fipsmodule/modes/gcm_nohw.c b/crypto/fipsmodule/modes/gcm_nohw.c
index 92d5441..4a63028 100644
--- a/crypto/fipsmodule/modes/gcm_nohw.c
+++ b/crypto/fipsmodule/modes/gcm_nohw.c
@@ -274,31 +274,29 @@
   Xi[1] = r3;
 }
 
-void gcm_gmult_nohw(uint64_t Xi[2], const u128 Htable[16]) {
+void gcm_gmult_nohw(uint8_t Xi[16], const u128 Htable[16]) {
   uint64_t swapped[2];
-  swapped[0] = CRYPTO_bswap8(Xi[1]);
-  swapped[1] = CRYPTO_bswap8(Xi[0]);
+  swapped[0] = CRYPTO_load_u64_be(Xi + 8);
+  swapped[1] = CRYPTO_load_u64_be(Xi);
   gcm_polyval_nohw(swapped, &Htable[0]);
-  Xi[0] = CRYPTO_bswap8(swapped[1]);
-  Xi[1] = CRYPTO_bswap8(swapped[0]);
+  CRYPTO_store_u64_be(Xi, swapped[1]);
+  CRYPTO_store_u64_be(Xi + 8, swapped[0]);
 }
 
-void gcm_ghash_nohw(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+void gcm_ghash_nohw(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                     size_t len) {
   uint64_t swapped[2];
-  swapped[0] = CRYPTO_bswap8(Xi[1]);
-  swapped[1] = CRYPTO_bswap8(Xi[0]);
+  swapped[0] = CRYPTO_load_u64_be(Xi + 8);
+  swapped[1] = CRYPTO_load_u64_be(Xi);
 
   while (len >= 16) {
-    uint64_t block[2];
-    OPENSSL_memcpy(block, inp, 16);
-    swapped[0] ^= CRYPTO_bswap8(block[1]);
-    swapped[1] ^= CRYPTO_bswap8(block[0]);
+    swapped[0] ^= CRYPTO_load_u64_be(inp + 8);
+    swapped[1] ^= CRYPTO_load_u64_be(inp);
     gcm_polyval_nohw(swapped, &Htable[0]);
     inp += 16;
     len -= 16;
   }
 
-  Xi[0] = CRYPTO_bswap8(swapped[1]);
-  Xi[1] = CRYPTO_bswap8(swapped[0]);
+  CRYPTO_store_u64_be(Xi, swapped[1]);
+  CRYPTO_store_u64_be(Xi + 8, swapped[0]);
 }
diff --git a/crypto/fipsmodule/modes/gcm_test.cc b/crypto/fipsmodule/modes/gcm_test.cc
index cfdccb2..b4f9b90 100644
--- a/crypto/fipsmodule/modes/gcm_test.cc
+++ b/crypto/fipsmodule/modes/gcm_test.cc
@@ -129,10 +129,8 @@
   uint8_t buf[16 * 32];
   OPENSSL_memset(buf, 42, sizeof(buf));
 
-  uint64_t X[2] = {
-      UINT64_C(0x0388dace60b6a392),
-      UINT64_C(0xf328c2b971b2fe78),
-  };
+  uint8_t X[16] = {0x92, 0xa3, 0xb6, 0x60, 0xce, 0xda, 0x88, 0x03,
+                   0x78, 0xfe, 0xb2, 0x71, 0xb9, 0xc2, 0x28, 0xf3};
 
   alignas(16) u128 Htable[16];
 #if defined(GHASH_ASM_X86) || defined(GHASH_ASM_X86_64)
diff --git a/crypto/fipsmodule/modes/internal.h b/crypto/fipsmodule/modes/internal.h
index 560b268..3b84015 100644
--- a/crypto/fipsmodule/modes/internal.h
+++ b/crypto/fipsmodule/modes/internal.h
@@ -53,6 +53,7 @@
 
 #include <openssl/aes.h>
 
+#include <assert.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -75,6 +76,20 @@
 typedef void (*block128_f)(const uint8_t in[16], uint8_t out[16],
                            const AES_KEY *key);
 
+OPENSSL_INLINE void CRYPTO_xor16(uint8_t out[16], const uint8_t a[16],
+                                 const uint8_t b[16]) {
+  // TODO(davidben): Ideally we'd leave this to the compiler, which could use
+  // vector registers, etc. But the compiler doesn't know that |a|, |b|, and
+  // |out| cannot partially alias. |restrict| is slightly too strict (we allow
+  // exact aliasing), but perhaps in-place could be a separate function?
+  static_assert(16 % sizeof(crypto_word_t) == 0,
+                "block cannot be evenly divided into words");
+  for (size_t i = 0; i < 16; i += sizeof(crypto_word_t)) {
+    CRYPTO_store_word_le(
+        out + i, CRYPTO_load_word_le(a + i) ^ CRYPTO_load_word_le(b + i));
+  }
+}
+
 
 // CTR.
 
@@ -115,12 +130,12 @@
 
 // gmult_func multiplies |Xi| by the GCM key and writes the result back to
 // |Xi|.
-typedef void (*gmult_func)(uint64_t Xi[2], const u128 Htable[16]);
+typedef void (*gmult_func)(uint8_t Xi[16], const u128 Htable[16]);
 
 // ghash_func repeatedly multiplies |Xi| by the GCM key and adds in blocks from
 // |inp|. The result is written back to |Xi| and the |len| argument must be a
 // multiple of 16.
-typedef void (*ghash_func)(uint64_t Xi[2], const u128 Htable[16],
+typedef void (*ghash_func)(uint8_t Xi[16], const u128 Htable[16],
                            const uint8_t *inp, size_t len);
 
 typedef struct gcm128_key_st {
@@ -143,12 +158,14 @@
 // should be zero-initialized before use.
 typedef struct {
   // The following 5 names follow names in GCM specification
-  union {
-    uint64_t u[2];
-    uint32_t d[4];
-    uint8_t c[16];
-    crypto_word_t t[16 / sizeof(crypto_word_t)];
-  } Yi, EKi, EK0, len, Xi;
+  uint8_t Yi[16];
+  uint8_t EKi[16];
+  uint8_t EK0[16];
+  struct {
+    uint64_t aad;
+    uint64_t msg;
+  } len;
+  uint8_t Xi[16];
 
   // |gcm_*_ssse3| require |Htable| to be 16-byte-aligned.
   // TODO(crbug.com/boringssl/604): Revisit this.
@@ -236,8 +253,8 @@
 // GCM assembly.
 
 void gcm_init_nohw(u128 Htable[16], const uint64_t H[2]);
-void gcm_gmult_nohw(uint64_t Xi[2], const u128 Htable[16]);
-void gcm_ghash_nohw(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+void gcm_gmult_nohw(uint8_t Xi[16], const u128 Htable[16]);
+void gcm_ghash_nohw(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                     size_t len);
 
 #if !defined(OPENSSL_NO_ASM)
@@ -245,31 +262,31 @@
 #if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
 #define GCM_FUNCREF
 void gcm_init_clmul(u128 Htable[16], const uint64_t Xi[2]);
-void gcm_gmult_clmul(uint64_t Xi[2], const u128 Htable[16]);
-void gcm_ghash_clmul(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+void gcm_gmult_clmul(uint8_t Xi[16], const u128 Htable[16]);
+void gcm_ghash_clmul(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                      size_t len);
 
 // |gcm_gmult_ssse3| and |gcm_ghash_ssse3| require |Htable| to be
 // 16-byte-aligned, but |gcm_init_ssse3| does not.
 void gcm_init_ssse3(u128 Htable[16], const uint64_t Xi[2]);
-void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
-void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
+void gcm_gmult_ssse3(uint8_t Xi[16], const u128 Htable[16]);
+void gcm_ghash_ssse3(uint8_t Xi[16], const u128 Htable[16], const uint8_t *in,
                      size_t len);
 
 #if defined(OPENSSL_X86_64)
 #define GHASH_ASM_X86_64
 void gcm_init_avx(u128 Htable[16], const uint64_t Xi[2]);
-void gcm_gmult_avx(uint64_t Xi[2], const u128 Htable[16]);
-void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
+void gcm_gmult_avx(uint8_t Xi[16], const u128 Htable[16]);
+void gcm_ghash_avx(uint8_t Xi[16], const u128 Htable[16], const uint8_t *in,
                    size_t len);
 
 #define HW_GCM
 size_t aesni_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                          const AES_KEY *key, uint8_t ivec[16],
-                         const u128 Htable[16], uint64_t *Xi);
+                         const u128 Htable[16], uint8_t Xi[16]);
 size_t aesni_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len,
                          const AES_KEY *key, uint8_t ivec[16],
-                         const u128 Htable[16], uint64_t *Xi);
+                         const u128 Htable[16], uint8_t Xi[16]);
 #endif  // OPENSSL_X86_64
 
 #if defined(OPENSSL_X86)
@@ -285,16 +302,16 @@
   return CRYPTO_is_ARMv8_PMULL_capable();
 }
 
-void gcm_init_v8(u128 Htable[16], const uint64_t Xi[2]);
-void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]);
-void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+void gcm_init_v8(u128 Htable[16], const uint64_t H[2]);
+void gcm_gmult_v8(uint8_t Xi[16], const u128 Htable[16]);
+void gcm_ghash_v8(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                   size_t len);
 
 OPENSSL_INLINE int gcm_neon_capable(void) { return CRYPTO_is_NEON_capable(); }
 
-void gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]);
-void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]);
-void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+void gcm_init_neon(u128 Htable[16], const uint64_t H[2]);
+void gcm_gmult_neon(uint8_t Xi[16], const u128 Htable[16]);
+void gcm_ghash_neon(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                     size_t len);
 
 #if defined(OPENSSL_AARCH64)
@@ -383,7 +400,7 @@
 // https://www.rfc-editor.org/rfc/rfc8452.html#section-3.
 
 struct polyval_ctx {
-  uint64_t S[2];
+  uint8_t S[16];
   // |gcm_*_ssse3| require |Htable| to be 16-byte-aligned.
   // TODO(crbug.com/boringssl/604): Revisit this.
   alignas(16) u128 Htable[16];
diff --git a/crypto/fipsmodule/modes/ofb.c b/crypto/fipsmodule/modes/ofb.c
index 5effba6..9260f2d 100644
--- a/crypto/fipsmodule/modes/ofb.c
+++ b/crypto/fipsmodule/modes/ofb.c
@@ -70,14 +70,7 @@
 
   while (len >= 16) {
     (*block)(ivec, ivec, key);
-    for (; n < 16; n += sizeof(size_t)) {
-      size_t a, b;
-      OPENSSL_memcpy(&a, in + n, sizeof(size_t));
-      OPENSSL_memcpy(&b, ivec + n, sizeof(size_t));
-
-      const size_t c = a ^ b;
-      OPENSSL_memcpy(out + n, &c, sizeof(size_t));
-    }
+    CRYPTO_xor16(out, in, ivec);
     len -= 16;
     out += 16;
     in += 16;