Tidy up poly1305 alignment a bit

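poly1305_vec.cc was not using the align_pointer helper; it rounded the
pointer up by hand instead. poly1305_arm.cc was a bit of a mess. It had
static asserts that there was room to align the overall struct, but it
then only aligned the data array inside the struct. The struct also
contains a size_t, so the whole struct needs to be aligned, not just
the array.

While here, replace the local load32/store32 helpers in poly1305_arm.cc
with CRYPTO_load_u32_le/CRYPTO_store_u32_le.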

Change-Id: Icf11544a83b2388a94ca54cda59a5a86f2708991
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/94787
Presubmit-BoringSSL-Verified: boringssl-scoped@luci-project-accounts.iam.gserviceaccount.com <boringssl-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: David Benjamin <davidben@google.com>
Commit-Queue: Lily Chen <chlily@google.com>
Reviewed-by: Lily Chen <chlily@google.com>
diff --git a/crypto/poly1305/poly1305_arm.cc b/crypto/poly1305/poly1305_arm.cc
index 210715c..6620d70 100644
--- a/crypto/poly1305/poly1305_arm.cc
+++ b/crypto/poly1305/poly1305_arm.cc
@@ -39,7 +39,7 @@
 extern void addmulmod(fe1305x2 *r, const fe1305x2 *x, const fe1305x2 *y,
                       const fe1305x2 *c);
 
-extern int blocks(fe1305x2 *h, const fe1305x2 *precomp, const uint8_t *in,
+extern int blocks(fe1305x2 *h, const fe1305x2 precomp[2], const uint8_t *in,
                   size_t inlen);
 }
 
@@ -108,16 +108,6 @@
   r->v[8] = y4;
 }
 
-static void store32(uint8_t out[4], uint32_t v) { OPENSSL_memcpy(out, &v, 4); }
-
-// load32 exists to avoid breaking strict aliasing rules in
-// fe1305x2_frombytearray.
-static uint32_t load32(const uint8_t t[4]) {
-  uint32_t tmp;
-  OPENSSL_memcpy(&tmp, t, sizeof(tmp));
-  return tmp;
-}
-
 static void fe1305x2_tobytearray(uint8_t r[16], fe1305x2 *x) {
   uint32_t x0 = x->v[0];
   uint32_t x1 = x->v[2];
@@ -134,10 +124,10 @@
   x4 += x3 >> 26;
   x3 &= 0x3ffffff;
 
-  store32(r, x0 + (x1 << 26));
-  store32(r + 4, (x1 >> 6) + (x2 << 20));
-  store32(r + 8, (x2 >> 12) + (x3 << 14));
-  store32(r + 12, (x3 >> 18) + (x4 << 8));
+  CRYPTO_store_u32_le(r, x0 + (x1 << 26));
+  CRYPTO_store_u32_le(r + 4, (x1 >> 6) + (x2 << 20));
+  CRYPTO_store_u32_le(r + 8, (x2 >> 12) + (x3 << 14));
+  CRYPTO_store_u32_le(r + 12, (x3 >> 18) + (x4 << 8));
 }
 
 static void fe1305x2_frombytearray(fe1305x2 *r, const uint8_t *x, size_t xlen) {
@@ -154,11 +144,11 @@
     t[i] = 0;
   }
 
-  r->v[0] = 0x3ffffff & load32(t);
-  r->v[2] = 0x3ffffff & (load32(t + 3) >> 2);
-  r->v[4] = 0x3ffffff & (load32(t + 6) >> 4);
-  r->v[6] = 0x3ffffff & (load32(t + 9) >> 6);
-  r->v[8] = load32(t + 13);
+  r->v[0] = 0x3ffffff & CRYPTO_load_u32_le(t);
+  r->v[2] = 0x3ffffff & (CRYPTO_load_u32_le(t + 3) >> 2);
+  r->v[4] = 0x3ffffff & (CRYPTO_load_u32_le(t + 6) >> 4);
+  r->v[6] = 0x3ffffff & (CRYPTO_load_u32_le(t + 9) >> 6);
+  r->v[8] = CRYPTO_load_u32_le(t + 13);
 
   if (xlen) {
     for (i = 0; (i < 16) && (i < xlen); i++) {
@@ -169,11 +159,11 @@
       t[i] = 0;
     }
 
-    r->v[1] = 0x3ffffff & load32(t);
-    r->v[3] = 0x3ffffff & (load32(t + 3) >> 2);
-    r->v[5] = 0x3ffffff & (load32(t + 6) >> 4);
-    r->v[7] = 0x3ffffff & (load32(t + 9) >> 6);
-    r->v[9] = load32(t + 13);
+    r->v[1] = 0x3ffffff & CRYPTO_load_u32_le(t);
+    r->v[3] = 0x3ffffff & (CRYPTO_load_u32_le(t + 3) >> 2);
+    r->v[5] = 0x3ffffff & (CRYPTO_load_u32_le(t + 6) >> 4);
+    r->v[7] = 0x3ffffff & (CRYPTO_load_u32_le(t + 9) >> 6);
+    r->v[9] = CRYPTO_load_u32_le(t + 13);
   } else {
     r->v[1] = r->v[3] = r->v[5] = r->v[7] = r->v[9] = 0;
   }
@@ -182,7 +172,7 @@
 static const fe1305x2 zero alignas(16) = {0};
 
 struct poly1305_state_st {
-  uint8_t data[sizeof(fe1305x2[5]) + 128];
+  fe1305x2 r, h, c, precomp[2];
   uint8_t buf[32];
   size_t buf_used;
   uint8_t key[16];
@@ -192,18 +182,21 @@
     sizeof(struct poly1305_state_st) + 63 <= sizeof(poly1305_state),
     "poly1305_state isn't large enough to hold aligned poly1305_state_st.");
 
-void CRYPTO_poly1305_init_neon(poly1305_state *state, const uint8_t key[32]) {
-  struct poly1305_state_st *st = (struct poly1305_state_st *)(state);
-  fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & (-(int)st->data)));
-  fe1305x2 *const h = r + 1;
-  fe1305x2 *const c = h + 1;
-  fe1305x2 *const precomp = c + 1;
+static poly1305_state_st *poly1305_aligned_state(poly1305_state *state) {
+  return reinterpret_cast<poly1305_state_st *>(align_pointer(state, 64));
+}
 
-  r->v[1] = r->v[0] = 0x3ffffff & load32(key);
-  r->v[3] = r->v[2] = 0x3ffff03 & (load32(key + 3) >> 2);
-  r->v[5] = r->v[4] = 0x3ffc0ff & (load32(key + 6) >> 4);
-  r->v[7] = r->v[6] = 0x3f03fff & (load32(key + 9) >> 6);
-  r->v[9] = r->v[8] = 0x00fffff & (load32(key + 12) >> 8);
+void CRYPTO_poly1305_init_neon(poly1305_state *state, const uint8_t key[32]) {
+  poly1305_state_st *st = poly1305_aligned_state(state);
+  fe1305x2 *const r = &st->r;
+  fe1305x2 *const h = &st->h;
+  fe1305x2 *const precomp = st->precomp;
+
+  r->v[1] = r->v[0] = 0x3ffffff & CRYPTO_load_u32_le(key);
+  r->v[3] = r->v[2] = 0x3ffff03 & (CRYPTO_load_u32_le(key + 3) >> 2);
+  r->v[5] = r->v[4] = 0x3ffc0ff & (CRYPTO_load_u32_le(key + 6) >> 4);
+  r->v[7] = r->v[6] = 0x3f03fff & (CRYPTO_load_u32_le(key + 9) >> 6);
+  r->v[9] = r->v[8] = 0x00fffff & (CRYPTO_load_u32_le(key + 12) >> 8);
 
   for (size_t j = 0; j < 10; j++) {
     h->v[j] = 0;  // XXX: should fast-forward a bit
@@ -218,11 +211,10 @@
 
 void CRYPTO_poly1305_update_neon(poly1305_state *state, const uint8_t *in,
                                  size_t in_len) {
-  struct poly1305_state_st *st = (struct poly1305_state_st *)(state);
-  fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & (-(int)st->data)));
-  fe1305x2 *const h = r + 1;
-  fe1305x2 *const c = h + 1;
-  fe1305x2 *const precomp = c + 1;
+  poly1305_state_st *st = poly1305_aligned_state(state);
+  fe1305x2 *const h = &st->h;
+  fe1305x2 *const c = &st->c;
+  fe1305x2 *const precomp = st->precomp;
 
   if (st->buf_used) {
     size_t todo = 32 - st->buf_used;
@@ -265,11 +257,11 @@
 }
 
 void CRYPTO_poly1305_finish_neon(poly1305_state *state, uint8_t mac[16]) {
-  struct poly1305_state_st *st = (struct poly1305_state_st *)(state);
-  fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & (-(int)st->data)));
-  fe1305x2 *const h = r + 1;
-  fe1305x2 *const c = h + 1;
-  fe1305x2 *const precomp = c + 1;
+  poly1305_state_st *st = poly1305_aligned_state(state);
+  fe1305x2 *const r = &st->r;
+  fe1305x2 *const h = &st->h;
+  fe1305x2 *const c = &st->c;
+  fe1305x2 *const precomp = st->precomp;
 
   addmulmod(h, h, precomp, &zero);
 
diff --git a/crypto/poly1305/poly1305_vec.cc b/crypto/poly1305/poly1305_vec.cc
index 100bfb7..20f77f2 100644
--- a/crypto/poly1305/poly1305_vec.cc
+++ b/crypto/poly1305/poly1305_vec.cc
@@ -85,7 +85,7 @@
               "poly1305_state_internal_t");
 
 poly1305_state_internal *poly1305_aligned_state(poly1305_state *state) {
-  return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
+  return reinterpret_cast<poly1305_state_internal *>(align_pointer(state, 64));
 }
 
 size_t poly1305_min(size_t a, size_t b) { return (a < b) ? a : b; }