Remove STRICT_ALIGNMENT code from modes.

STRICT_ALIGNMENT is a remnant of OpenSSL code that would cast pointers to
size_t* and load more than one byte at a time. Not all architectures
support unaligned access, so it did an alignment check and only entered
this path if aligned or the underlying architecture didn't care.

This is UB. Unaligned casts in C are undefined on all architectures, so
we switched these to memcpy some time ago. Compilers can optimize memcpy
to the unaligned accesses we wanted. That left our modes logic as:

- If STRICT_ALIGNMENT is 1 and things are unaligned, work byte-by-byte.

- Otherwise, use the memcpy-based word-by-word code, which now works
  independent of STRICT_ALIGNMENT.

Remove the first check to simplify things. On x86, x86_64, and aarch64,
STRICT_ALIGNMENT is zero and this is a no-op. ARM is more complex. Per
[0], ARMv7 and up support unaligned access. ARMv5 does not. ARMv6 does,
but can run in a mode where it looks more like ARMv5.

For ARMv7 and up, STRICT_ALIGNMENT should have been zero, but was one.
Thus this change should be an improvement for ARMv7 (right now unaligned
inputs lose bsaes-armv7). The Android NDK does not even support the
pre-ARMv7 ABI anymore[1]. Nonetheless, Cronet still supports ARMv6 as a
library. It builds with -march=armv6 which GCC interprets as supporting
unaligned access, so it too did not want this code.

For completeness, should anyone still care about ARMv5 or be building
with an overly permissive -march flag, GCC does appear unable to inline
the memcpy calls. However, GCC also does not interpret
(uintptr_t)ptr % sizeof(size_t) as an alignment assertion, so such
consumers have already been paying for the memcpy here and throughout
the library.

In general, C's arcane pointer rules mean we must resort to memcpy
often, so, realistically, we must require that the compiler optimize
memcpy well.

[0] https://medium.com/@iLevex/the-curious-case-of-unaligned-access-on-arm-5dd0ebe24965
[1] https://developer.android.com/ndk/guides/abis#armeabi

Change-Id: I3c7dea562adaeb663032e395499e69530dd8e145
Reviewed-on: https://boringssl-review.googlesource.com/c/34873
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/modes/cbc.c b/crypto/fipsmodule/modes/cbc.c
index 64ea505..3f1d777 100644
--- a/crypto/fipsmodule/modes/cbc.c
+++ b/crypto/fipsmodule/modes/cbc.c
@@ -49,6 +49,8 @@
 #include <assert.h>
 #include <string.h>
 
+#include <openssl/type_check.h>
+
 #include "internal.h"
 
 
@@ -61,30 +63,15 @@
   assert(key != NULL && ivec != NULL);
   assert(len == 0 || (in != NULL && out != NULL));
 
-  if (STRICT_ALIGNMENT &&
-      ((uintptr_t)in | (uintptr_t)out | (uintptr_t)ivec) % sizeof(size_t) !=
-          0) {
-    while (len >= 16) {
-      for (n = 0; n < 16; ++n) {
-        out[n] = in[n] ^ iv[n];
-      }
-      (*block)(out, out, key);
-      iv = out;
-      len -= 16;
-      in += 16;
-      out += 16;
+  while (len >= 16) {
+    for (n = 0; n < 16; n += sizeof(size_t)) {
+      store_word_le(out + n, load_word_le(in + n) ^ load_word_le(iv + n));
     }
-  } else {
-    while (len >= 16) {
-      for (n = 0; n < 16; n += sizeof(size_t)) {
-        store_word_le(out + n, load_word_le(in + n) ^ load_word_le(iv + n));
-      }
-      (*block)(out, out, key);
-      iv = out;
-      len -= 16;
-      in += 16;
-      out += 16;
-    }
+    (*block)(out, out, key);
+    iv = out;
+    len -= 16;
+    in += 16;
+    out += 16;
   }
 
   while (len) {
@@ -127,66 +114,35 @@
   if ((inptr >= 32 && outptr <= inptr - 32) || inptr < outptr) {
     // If |out| is at least two blocks behind |in| or completely disjoint, there
     // is no need to decrypt to a temporary block.
+    OPENSSL_STATIC_ASSERT(16 % sizeof(size_t) == 0,
+                          "block cannot be evenly divided into words");
     const uint8_t *iv = ivec;
-
-    if (STRICT_ALIGNMENT &&
-        ((uintptr_t)in | (uintptr_t)out | (uintptr_t)ivec) % sizeof(size_t) !=
-            0) {
-      while (len >= 16) {
-        (*block)(in, out, key);
-        for (n = 0; n < 16; ++n) {
-          out[n] ^= iv[n];
-        }
-        iv = in;
-        len -= 16;
-        in += 16;
-        out += 16;
+    while (len >= 16) {
+      (*block)(in, out, key);
+      for (n = 0; n < 16; n += sizeof(size_t)) {
+        store_word_le(out + n, load_word_le(out + n) ^ load_word_le(iv + n));
       }
-    } else if (16 % sizeof(size_t) == 0) {  // always true
-      while (len >= 16) {
-        (*block)(in, out, key);
-        for (n = 0; n < 16; n += sizeof(size_t)) {
-          store_word_le(out + n, load_word_le(out + n) ^ load_word_le(iv + n));
-        }
-        iv = in;
-        len -= 16;
-        in += 16;
-        out += 16;
-      }
+      iv = in;
+      len -= 16;
+      in += 16;
+      out += 16;
     }
     OPENSSL_memcpy(ivec, iv, 16);
   } else {
-    // |out| is less than two blocks behind |in|. Decrypting an input block
-    // directly to |out| would overwrite a ciphertext block before it is used as
-    // the next block's IV. Decrypt to a temporary block instead.
-    if (STRICT_ALIGNMENT &&
-        ((uintptr_t)in | (uintptr_t)out | (uintptr_t)ivec) % sizeof(size_t) !=
-            0) {
-      uint8_t c;
-      while (len >= 16) {
-        (*block)(in, tmp.c, key);
-        for (n = 0; n < 16; ++n) {
-          c = in[n];
-          out[n] = tmp.c[n] ^ ivec[n];
-          ivec[n] = c;
-        }
-        len -= 16;
-        in += 16;
-        out += 16;
+    OPENSSL_STATIC_ASSERT(16 % sizeof(size_t) == 0,
+                          "block cannot be evenly divided into words");
+
+    while (len >= 16) {
+      (*block)(in, tmp.c, key);
+      for (n = 0; n < 16; n += sizeof(size_t)) {
+        size_t c = load_word_le(in + n);
+        store_word_le(out + n,
+                      tmp.t[n / sizeof(size_t)] ^ load_word_le(ivec + n));
+        store_word_le(ivec + n, c);
       }
-    } else if (16 % sizeof(size_t) == 0) {  // always true
-      while (len >= 16) {
-        (*block)(in, tmp.c, key);
-        for (n = 0; n < 16; n += sizeof(size_t)) {
-          size_t c = load_word_le(in + n);
-          store_word_le(out + n,
-                        tmp.t[n / sizeof(size_t)] ^ load_word_le(ivec + n));
-          store_word_le(ivec + n, c);
-        }
-        len -= 16;
-        in += 16;
-        out += 16;
-      }
+      len -= 16;
+      in += 16;
+      out += 16;
     }
   }
 
diff --git a/crypto/fipsmodule/modes/cfb.c b/crypto/fipsmodule/modes/cfb.c
index 0a81f3b..8ca9004 100644
--- a/crypto/fipsmodule/modes/cfb.c
+++ b/crypto/fipsmodule/modes/cfb.c
@@ -60,8 +60,6 @@
 void CRYPTO_cfb128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                            const AES_KEY *key, uint8_t ivec[16], unsigned *num,
                            int enc, block128_f block) {
-  size_t l = 0;
-
   assert(in && out && key && ivec && num);
 
   unsigned n = *num;
@@ -72,21 +70,6 @@
       --len;
       n = (n + 1) % 16;
     }
-#if STRICT_ALIGNMENT
-    if (((uintptr_t)in | (uintptr_t)out | (uintptr_t)ivec) % sizeof(size_t) !=
-        0) {
-      while (l < len) {
-        if (n == 0) {
-          (*block)(ivec, ivec, key);
-        }
-        out[l] = ivec[n] ^= in[l];
-        ++l;
-        n = (n + 1) % 16;
-      }
-      *num = n;
-      return;
-    }
-#endif
     while (len >= 16) {
       (*block)(ivec, ivec, key);
       for (; n < 16; n += sizeof(size_t)) {
@@ -116,22 +99,6 @@
       --len;
       n = (n + 1) % 16;
     }
-    if (STRICT_ALIGNMENT &&
-        ((uintptr_t)in | (uintptr_t)out | (uintptr_t)ivec) % sizeof(size_t) !=
-            0) {
-      while (l < len) {
-        uint8_t c;
-        if (n == 0) {
-          (*block)(ivec, ivec, key);
-        }
-        out[l] = ivec[n] ^ (c = in[l]);
-        ivec[n] = c;
-        ++l;
-        n = (n + 1) % 16;
-      }
-      *num = n;
-      return;
-    }
     while (len >= 16) {
       (*block)(ivec, ivec, key);
       for (; n < 16; n += sizeof(size_t)) {
diff --git a/crypto/fipsmodule/modes/ctr.c b/crypto/fipsmodule/modes/ctr.c
index b806b9a..8b0e059 100644
--- a/crypto/fipsmodule/modes/ctr.c
+++ b/crypto/fipsmodule/modes/ctr.c
@@ -99,26 +99,6 @@
     --len;
     n = (n + 1) % 16;
   }
-
-#if STRICT_ALIGNMENT
-  if (((uintptr_t)in | (uintptr_t)out |
-        (uintptr_t)ecount_buf) % sizeof(size_t) != 0) {
-    size_t l = 0;
-    while (l < len) {
-      if (n == 0) {
-        (*block)(ivec, ecount_buf, key);
-        ctr128_inc(ivec);
-      }
-      out[l] = in[l] ^ ecount_buf[n];
-      ++l;
-      n = (n + 1) % 16;
-    }
-
-    *num = n;
-    return;
-  }
-#endif
-
   while (len >= 16) {
     (*block)(ivec, ecount_buf, key);
     ctr128_inc(ivec);
diff --git a/crypto/fipsmodule/modes/gcm.c b/crypto/fipsmodule/modes/gcm.c
index 97fde3e..f92f675 100644
--- a/crypto/fipsmodule/modes/gcm.c
+++ b/crypto/fipsmodule/modes/gcm.c
@@ -514,24 +514,6 @@
   }
 
   uint32_t ctr = CRYPTO_bswap4(ctx->Yi.d[3]);
-  if (STRICT_ALIGNMENT &&
-      ((uintptr_t)in | (uintptr_t)out) % sizeof(size_t) != 0) {
-    for (size_t i = 0; i < len; ++i) {
-      if (n == 0) {
-        (*block)(ctx->Yi.c, ctx->EKi.c, key);
-        ++ctr;
-        ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
-      }
-      ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
-      n = (n + 1) % 16;
-      if (n == 0) {
-        GCM_MUL(ctx, Xi);
-      }
-    }
-
-    ctx->mres = n;
-    return 1;
-  }
   while (len >= GHASH_CHUNK) {
     size_t j = GHASH_CHUNK;
 
@@ -622,27 +604,6 @@
   }
 
   uint32_t ctr = CRYPTO_bswap4(ctx->Yi.d[3]);
-  if (STRICT_ALIGNMENT &&
-      ((uintptr_t)in | (uintptr_t)out) % sizeof(size_t) != 0) {
-    for (size_t i = 0; i < len; ++i) {
-      uint8_t c;
-      if (n == 0) {
-        (*block)(ctx->Yi.c, ctx->EKi.c, key);
-        ++ctr;
-        ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
-      }
-      c = in[i];
-      out[i] = c ^ ctx->EKi.c[n];
-      ctx->Xi.c[n] ^= c;
-      n = (n + 1) % 16;
-      if (n == 0) {
-        GCM_MUL(ctx, Xi);
-      }
-    }
-
-    ctx->mres = n;
-    return 1;
-  }
   while (len >= GHASH_CHUNK) {
     size_t j = GHASH_CHUNK;
 
diff --git a/crypto/fipsmodule/modes/internal.h b/crypto/fipsmodule/modes/internal.h
index 18ec60c..ed1160b 100644
--- a/crypto/fipsmodule/modes/internal.h
+++ b/crypto/fipsmodule/modes/internal.h
@@ -64,12 +64,6 @@
 #endif
 
 
-#define STRICT_ALIGNMENT 1
-#if defined(OPENSSL_X86_64) || defined(OPENSSL_X86) || defined(OPENSSL_AARCH64)
-#undef STRICT_ALIGNMENT
-#define STRICT_ALIGNMENT 0
-#endif
-
 static inline uint32_t GETU32(const void *in) {
   uint32_t v;
   OPENSSL_memcpy(&v, in, sizeof(v));