Appease UBSan on pointer alignment.

Even without strict aliasing, C does not allow casting a pointer to a
type whose alignment requirement the pointer does not satisfy. After
this change, UBSan is happy with our code at its default settings,
except for the negative left shift language bug.

Note: on architectures without unaligned loads, compilers do not
generate the same code for memcpy as for a pointer cast. But even ARMv6
can perform unaligned loads and stores (ARMv5 could not), so we should
be okay here.
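
A minimal sketch of the pattern, using plain memcpy rather than
OPENSSL_memcpy so it stands alone (the in-tree helpers added to
crypto/fipsmodule/modes/internal.h have the same shape):

    #include <stddef.h>
    #include <string.h>

    // Undefined behavior when |in| is not size_t-aligned:
    //   size_t v = *(const size_t *)in;

    // Alignment-safe replacement. Compilers turn the memcpy into a
    // single (possibly unaligned) load or store on targets that
    // permit it, so there is no real copy in practice.
    static inline size_t load_word_le(const void *in) {
      size_t v;
      memcpy(&v, in, sizeof(v));
      return v;
    }

    static inline void store_word_le(void *out, size_t v) {
      memcpy(out, &v, sizeof(v));
    }

    // Typical use, mirroring the CBC encryption loop in this change:
    //   for (n = 0; n < 16; n += sizeof(size_t)) {
    //     store_word_le(out + n,
    //                   load_word_le(in + n) ^ load_word_le(iv + n));
    //   }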

Before:
Did 11086000 AES-128-GCM (16 bytes) seal operations in 5000391us (2217026.6 ops/sec): 35.5 MB/s
Did 370000 AES-128-GCM (1350 bytes) seal operations in 5005208us (73923.0 ops/sec): 99.8 MB/s
Did 63000 AES-128-GCM (8192 bytes) seal operations in 5029958us (12525.0 ops/sec): 102.6 MB/s
Did 9894000 AES-256-GCM (16 bytes) seal operations in 5000017us (1978793.3 ops/sec): 31.7 MB/s
Did 316000 AES-256-GCM (1350 bytes) seal operations in 5005564us (63129.7 ops/sec): 85.2 MB/s
Did 54000 AES-256-GCM (8192 bytes) seal operations in 5054156us (10684.3 ops/sec): 87.5 MB/s

After:
Did 11026000 AES-128-GCM (16 bytes) seal operations in 5000197us (2205113.1 ops/sec): 35.3 MB/s
Did 370000 AES-128-GCM (1350 bytes) seal operations in 5005781us (73914.5 ops/sec): 99.8 MB/s
Did 63000 AES-128-GCM (8192 bytes) seal operations in 5032695us (12518.1 ops/sec): 102.5 MB/s
Did 9831750 AES-256-GCM (16 bytes) seal operations in 5000010us (1966346.1 ops/sec): 31.5 MB/s
Did 316000 AES-256-GCM (1350 bytes) seal operations in 5005702us (63128.0 ops/sec): 85.2 MB/s
Did 54000 AES-256-GCM (8192 bytes) seal operations in 5053642us (10685.4 ops/sec): 87.5 MB/s

(Tested with the no-asm builds; most of this code isn't reachable
otherwise.)
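
For reference, assuming the usual CMake setup, such a build can be
configured via the OPENSSL_NO_ASM option; the generator and paths here
are illustrative:

    cmake -GNinja -DOPENSSL_NO_ASM=1 ..
    ninja
    ./tool/bssl speed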

Change-Id: I025c365d26491abed0116b0de3b7612159e52297
Reviewed-on: https://boringssl-review.googlesource.com/22804
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/modes/cbc.c b/crypto/fipsmodule/modes/cbc.c
index 4b3bdb8..db9f024 100644
--- a/crypto/fipsmodule/modes/cbc.c
+++ b/crypto/fipsmodule/modes/cbc.c
@@ -62,7 +62,8 @@
   assert(len == 0 || (in != NULL && out != NULL));
 
   if (STRICT_ALIGNMENT &&
-      ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+      ((uintptr_t)in | (uintptr_t)out | (uintptr_t)ivec) % sizeof(size_t) !=
+          0) {
     while (len >= 16) {
       for (n = 0; n < 16; ++n) {
         out[n] = in[n] ^ iv[n];
@@ -76,7 +77,7 @@
   } else {
     while (len >= 16) {
       for (n = 0; n < 16; n += sizeof(size_t)) {
-        *(size_t *)(out + n) = *(size_t *)(in + n) ^ *(size_t *)(iv + n);
+        store_word_le(out + n, load_word_le(in + n) ^ load_word_le(iv + n));
       }
       (*block)(out, out, key);
       iv = out;
@@ -129,7 +130,8 @@
     const uint8_t *iv = ivec;
 
     if (STRICT_ALIGNMENT &&
-        ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+        ((uintptr_t)in | (uintptr_t)out | (uintptr_t)ivec) % sizeof(size_t) !=
+            0) {
       while (len >= 16) {
         (*block)(in, out, key);
         for (n = 0; n < 16; ++n) {
@@ -142,11 +144,9 @@
       }
     } else if (16 % sizeof(size_t) == 0) {  // always true
       while (len >= 16) {
-        size_t *out_t = (size_t *)out, *iv_t = (size_t *)iv;
-
         (*block)(in, out, key);
-        for (n = 0; n < 16 / sizeof(size_t); n++) {
-          out_t[n] ^= iv_t[n];
+        for (n = 0; n < 16; n += sizeof(size_t)) {
+          store_word_le(out + n, load_word_le(out + n) ^ load_word_le(iv + n));
         }
         iv = in;
         len -= 16;
@@ -160,7 +160,8 @@
     // directly to |out| would overwrite a ciphertext block before it is used as
     // the next block's IV. Decrypt to a temporary block instead.
     if (STRICT_ALIGNMENT &&
-        ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+        ((uintptr_t)in | (uintptr_t)out | (uintptr_t)ivec) % sizeof(size_t) !=
+            0) {
       uint8_t c;
       while (len >= 16) {
         (*block)(in, tmp.c, key);
@@ -175,14 +176,12 @@
       }
     } else if (16 % sizeof(size_t) == 0) {  // always true
       while (len >= 16) {
-        size_t c, *out_t = (size_t *)out, *ivec_t = (size_t *)ivec;
-        const size_t *in_t = (const size_t *)in;
-
         (*block)(in, tmp.c, key);
-        for (n = 0; n < 16 / sizeof(size_t); n++) {
-          c = in_t[n];
-          out_t[n] = tmp.t[n] ^ ivec_t[n];
-          ivec_t[n] = c;
+        for (n = 0; n < 16; n += sizeof(size_t)) {
+          size_t c = load_word_le(in + n);
+          store_word_le(out + n,
+                        tmp.t[n / sizeof(size_t)] ^ load_word_le(ivec + n));
+          store_word_le(ivec + n, c);
         }
         len -= 16;
         in += 16;
diff --git a/crypto/fipsmodule/modes/cfb.c b/crypto/fipsmodule/modes/cfb.c
index 2775d19..e1b0a80 100644
--- a/crypto/fipsmodule/modes/cfb.c
+++ b/crypto/fipsmodule/modes/cfb.c
@@ -72,7 +72,8 @@
       n = (n + 1) % 16;
     }
 #if STRICT_ALIGNMENT
-    if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+    if (((uintptr_t)in | (uintptr_t)out | (uintptr_t)ivec) % sizeof(size_t) !=
+        0) {
       while (l < len) {
         if (n == 0) {
           (*block)(ivec, ivec, key);
@@ -88,7 +89,9 @@
     while (len >= 16) {
       (*block)(ivec, ivec, key);
       for (; n < 16; n += sizeof(size_t)) {
-        *(size_t *)(out + n) = *(size_t *)(ivec + n) ^= *(size_t *)(in + n);
+        size_t tmp = load_word_le(ivec + n) ^ load_word_le(in + n);
+        store_word_le(ivec + n, tmp);
+        store_word_le(out + n, tmp);
       }
       len -= 16;
       out += 16;
@@ -112,9 +115,11 @@
       --len;
       n = (n + 1) % 16;
     }
-    if (STRICT_ALIGNMENT && ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+    if (STRICT_ALIGNMENT &&
+        ((uintptr_t)in | (uintptr_t)out | (uintptr_t)ivec) % sizeof(size_t) !=
+            0) {
       while (l < len) {
-        unsigned char c;
+        uint8_t c;
         if (n == 0) {
           (*block)(ivec, ivec, key);
         }
@@ -129,9 +134,9 @@
     while (len >= 16) {
       (*block)(ivec, ivec, key);
       for (; n < 16; n += sizeof(size_t)) {
-        size_t t = *(size_t *)(in + n);
-        *(size_t *)(out + n) = *(size_t *)(ivec + n) ^ t;
-        *(size_t *)(ivec + n) = t;
+        size_t t = load_word_le(in + n);
+        store_word_le(out + n, load_word_le(ivec + n) ^ t);
+        store_word_le(ivec + n, t);
       }
       len -= 16;
       out += 16;
@@ -227,4 +232,3 @@
     cfbr_encrypt_block(&in[n], &out[n], 8, key, ivec, enc, block);
   }
 }
-
diff --git a/crypto/fipsmodule/modes/ctr.c b/crypto/fipsmodule/modes/ctr.c
index 5a97cf6..63907b4 100644
--- a/crypto/fipsmodule/modes/ctr.c
+++ b/crypto/fipsmodule/modes/ctr.c
@@ -100,7 +100,8 @@
   }
 
 #if STRICT_ALIGNMENT
-  if (((size_t)in | (size_t)out | (size_t)ecount_buf) % sizeof(size_t) != 0) {
+  if (((uintptr_t)in | (uintptr_t)out |
+        (uintptr_t)ecount_buf) % sizeof(size_t) != 0) {
     size_t l = 0;
     while (l < len) {
       if (n == 0) {
@@ -121,8 +122,8 @@
     (*block)(ivec, ecount_buf, key);
     ctr128_inc(ivec);
     for (n = 0; n < 16; n += sizeof(size_t)) {
-      *(size_t *)(out + n) = *(const size_t *)(in + n) ^
-                             *(const size_t *)(ecount_buf + n);
+      store_word_le(out + n,
+                    load_word_le(in + n) ^ load_word_le(ecount_buf + n));
     }
     len -= 16;
     out += 16;
diff --git a/crypto/fipsmodule/modes/gcm.c b/crypto/fipsmodule/modes/gcm.c
index bb5be54..05cd18d 100644
--- a/crypto/fipsmodule/modes/gcm.c
+++ b/crypto/fipsmodule/modes/gcm.c
@@ -550,8 +550,7 @@
 }
 
 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, const void *key,
-                          const unsigned char *in, unsigned char *out,
-                          size_t len) {
+                          const uint8_t *in, uint8_t *out, size_t len) {
   unsigned int n, ctr;
   uint64_t mlen = ctx->len.u[1];
   block128_f block = ctx->block;
@@ -592,7 +591,8 @@
       return 1;
     }
   }
-  if (STRICT_ALIGNMENT && ((size_t)in | (size_t)out) % sizeof(size_t) != 0) {
+  if (STRICT_ALIGNMENT &&
+      ((uintptr_t)in | (uintptr_t)out) % sizeof(size_t) != 0) {
     for (size_t i = 0; i < len; ++i) {
       if (n == 0) {
         (*block)(ctx->Yi.c, ctx->EKi.c, key);
@@ -614,14 +614,12 @@
     size_t j = GHASH_CHUNK;
 
     while (j) {
-      size_t *out_t = (size_t *)out;
-      const size_t *in_t = (const size_t *)in;
-
       (*block)(ctx->Yi.c, ctx->EKi.c, key);
       ++ctr;
       ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
-      for (size_t i = 0; i < 16 / sizeof(size_t); ++i) {
-        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+      for (size_t i = 0; i < 16; i += sizeof(size_t)) {
+        store_word_le(out + i,
+                      load_word_le(in + i) ^ ctx->EKi.t[i / sizeof(size_t)]);
       }
       out += 16;
       in += 16;
@@ -633,14 +631,12 @@
   size_t len_blocks = len & kSizeTWithoutLower4Bits;
   if (len_blocks != 0) {
     while (len >= 16) {
-      size_t *out_t = (size_t *)out;
-      const size_t *in_t = (const size_t *)in;
-
       (*block)(ctx->Yi.c, ctx->EKi.c, key);
       ++ctr;
       ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
-      for (size_t i = 0; i < 16 / sizeof(size_t); ++i) {
-        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+      for (size_t i = 0; i < 16; i += sizeof(size_t)) {
+        store_word_le(out + i,
+                      load_word_le(in + i) ^ ctx->EKi.t[i / sizeof(size_t)]);
       }
       out += 16;
       in += 16;
@@ -650,14 +646,13 @@
   }
 #else
   while (len >= 16) {
-    size_t *out_t = (size_t *)out;
-    const size_t *in_t = (const size_t *)in;
-
     (*block)(ctx->Yi.c, ctx->EKi.c, key);
     ++ctr;
     ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
-    for (size_t i = 0; i < 16 / sizeof(size_t); ++i) {
-      ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+    for (size_t i = 0; i < 16; i += sizeof(size_t)) {
+      size_t tmp = load_word_le(in + i) ^ ctx->EKi.t[i / sizeof(size_t)];
+      store_word_le(out + i, tmp);
+      ctx->Xi.t[i / sizeof(size_t)] ^= tmp;
     }
     GCM_MUL(ctx, Xi);
     out += 16;
@@ -724,7 +719,8 @@
       return 1;
     }
   }
-  if (STRICT_ALIGNMENT && ((size_t)in | (size_t)out) % sizeof(size_t) != 0) {
+  if (STRICT_ALIGNMENT &&
+      ((uintptr_t)in | (uintptr_t)out) % sizeof(size_t) != 0) {
     for (size_t i = 0; i < len; ++i) {
       uint8_t c;
       if (n == 0) {
@@ -750,14 +746,12 @@
 
     GHASH(ctx, in, GHASH_CHUNK);
     while (j) {
-      size_t *out_t = (size_t *)out;
-      const size_t *in_t = (const size_t *)in;
-
       (*block)(ctx->Yi.c, ctx->EKi.c, key);
       ++ctr;
       ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
-      for (size_t i = 0; i < 16 / sizeof(size_t); ++i) {
-        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+      for (size_t i = 0; i < 16; i += sizeof(size_t)) {
+        store_word_le(out + i,
+                      load_word_le(in + i) ^ ctx->EKi.t[i / sizeof(size_t)]);
       }
       out += 16;
       in += 16;
@@ -769,14 +763,12 @@
   if (len_blocks != 0) {
     GHASH(ctx, in, len_blocks);
     while (len >= 16) {
-      size_t *out_t = (size_t *)out;
-      const size_t *in_t = (const size_t *)in;
-
       (*block)(ctx->Yi.c, ctx->EKi.c, key);
       ++ctr;
       ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
-      for (size_t i = 0; i < 16 / sizeof(size_t); ++i) {
-        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+      for (size_t i = 0; i < 16; i += sizeof(size_t)) {
+        store_word_le(out + i,
+                      load_word_le(in + i) ^ ctx->EKi.t[i / sizeof(size_t)]);
       }
       out += 16;
       in += 16;
@@ -785,16 +777,13 @@
   }
 #else
   while (len >= 16) {
-    size_t *out_t = (size_t *)out;
-    const size_t *in_t = (const size_t *)in;
-
     (*block)(ctx->Yi.c, ctx->EKi.c, key);
     ++ctr;
     ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
-    for (size_t i = 0; i < 16 / sizeof(size_t); ++i) {
-      size_t c = in_t[i];
-      out_t[i] = c ^ ctx->EKi.t[i];
-      ctx->Xi.t[i] ^= c;
+    for (size_t i = 0; i < 16; i += sizeof(size_t)) {
+      size_t c = load_word_le(in + i);
+      store_word_le(out + i, c ^ ctx->EKi.t[i / sizeof(size_t)]);
+      ctx->Xi.t[i / sizeof(size_t)] ^= c;
     }
     GCM_MUL(ctx, Xi);
     out += 16;
diff --git a/crypto/fipsmodule/modes/internal.h b/crypto/fipsmodule/modes/internal.h
index 6a5ff99..f6ee8f4 100644
--- a/crypto/fipsmodule/modes/internal.h
+++ b/crypto/fipsmodule/modes/internal.h
@@ -109,6 +109,16 @@
   OPENSSL_memcpy(out, &v, sizeof(v));
 }
 
+static inline size_t load_word_le(const void *in) {
+  size_t v;
+  OPENSSL_memcpy(&v, in, sizeof(v));
+  return v;
+}
+
+static inline void store_word_le(void *out, size_t v) {
+  OPENSSL_memcpy(out, &v, sizeof(v));
+}
+
 // block128_f is the type of a 128-bit, block cipher.
 typedef void (*block128_f)(const uint8_t in[16], uint8_t out[16],
                            const void *key);