Remove custom memcpy and memset from poly1305_vec.

This avoids upsetting the C compiler. UBSan is offended by the
alignment violations in those functions (unaligned byte pointers are
cast to uint16_t/uint32_t/uint64_t and dereferenced). The offset trick
is also undefined behavior: pointer arithmetic is supposed to stay
within a single object, but src - dst subtracts pointers into two
different buffers.
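
For illustration, a minimal sketch of the pattern in question and its
standard replacement (function names here are hypothetical, not taken
from the tree):

  #include <stdint.h>
  #include <string.h>

  // Sketch of the removed pattern. Both marked lines are undefined
  // behavior when src and dst point into different buffers and are
  // not suitably aligned.
  static void copy8_ub(uint8_t *dst, const uint8_t *src) {
    size_t offset = src - dst;  // UB: src and dst are different objects.
    // UB: possibly unaligned access through uint64_t.
    *(uint64_t *)dst = *(const uint64_t *)(dst + offset);
  }

  // The well-defined equivalent: memcpy has no alignment requirement,
  // and compilers lower a fixed-size 8-byte copy to a single load and
  // store anyway.
  static void copy8_ok(uint8_t *dst, const uint8_t *src) {
    memcpy(dst, src, 8);
  }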

There is a small performance cost, however:

Before:
Did 6636000 ChaCha20-Poly1305 (16 bytes) seal operations in 5000475us (1327073.9 ops/sec): 21.2 MB/s
Did 832000 ChaCha20-Poly1305 (1350 bytes) seal operations in 5003481us (166284.2 ops/sec): 224.5 MB/s
Did 155000 ChaCha20-Poly1305 (8192 bytes) seal operations in 5026933us (30833.9 ops/sec): 252.6 MB/s

After:
Did 6508000 ChaCha20-Poly1305 (16 bytes) seal operations in 5000160us (1301558.4 ops/sec): 20.8 MB/s
Did 831000 ChaCha20-Poly1305 (1350 bytes) seal operations in 5002865us (166104.8 ops/sec): 224.2 MB/s
Did 155000 ChaCha20-Poly1305 (8192 bytes) seal operations in 5013204us (30918.4 ops/sec): 253.3 MB/s

(Tested with the no-asm build, which disables the custom stitched-mode
assembly and ends up using this implementation.)

Change-Id: I76d74183f1e04ad3726463a8871ee64be04ce674
Reviewed-on: https://boringssl-review.googlesource.com/22784
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/poly1305/poly1305_vec.c b/crypto/poly1305/poly1305_vec.c
index 80eaa36..480d9e5 100644
--- a/crypto/poly1305/poly1305_vec.c
+++ b/crypto/poly1305/poly1305_vec.c
@@ -85,57 +85,6 @@
   return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
 }
 
-// copy 0-63 bytes
-static inline void
-poly1305_block_copy(uint8_t *dst, const uint8_t *src, size_t bytes) {
-  size_t offset = src - dst;
-  if (bytes & 32) {
-    _mm_storeu_si128((xmmi *)(dst + 0),
-                     _mm_loadu_si128((const xmmi *)(dst + offset + 0)));
-    _mm_storeu_si128((xmmi *)(dst + 16),
-                     _mm_loadu_si128((const xmmi *)(dst + offset + 16)));
-    dst += 32;
-  }
-  if (bytes & 16) {
-    _mm_storeu_si128((xmmi *)dst, _mm_loadu_si128((const xmmi *)(dst + offset)));
-    dst += 16;
-  }
-  if (bytes & 8) {
-    *(uint64_t *)dst = *(const uint64_t *)(dst + offset);
-    dst += 8;
-  }
-  if (bytes & 4) {
-    *(uint32_t *)dst = *(const uint32_t *)(dst + offset);
-    dst += 4;
-  }
-  if (bytes & 2) {
-    *(uint16_t *)dst = *(uint16_t *)(dst + offset);
-    dst += 2;
-  }
-  if (bytes & 1) {
-    *(uint8_t *)dst = *(uint8_t *)(dst + offset);
-  }
-}
-
-// zero 0-15 bytes
-static inline void poly1305_block_zero(uint8_t *dst, size_t bytes) {
-  if (bytes & 8) {
-    *(uint64_t *)dst = 0;
-    dst += 8;
-  }
-  if (bytes & 4) {
-    *(uint32_t *)dst = 0;
-    dst += 4;
-  }
-  if (bytes & 2) {
-    *(uint16_t *)dst = 0;
-    dst += 2;
-  }
-  if (bytes & 1) {
-    *(uint8_t *)dst = 0;
-  }
-}
-
 static inline size_t poly1305_min(size_t a, size_t b) {
   return (a < b) ? a : b;
 }
@@ -721,7 +670,7 @@
       bytes -= 32;
     } else {
       want = poly1305_min(32 - st->leftover, bytes);
-      poly1305_block_copy(st->buffer + st->leftover, m, want);
+      OPENSSL_memcpy(st->buffer + st->leftover, m, want);
       bytes -= want;
       m += want;
       st->leftover += want;
@@ -737,7 +686,7 @@
   // handle leftover
   if (st->leftover) {
     want = poly1305_min(64 - st->leftover, bytes);
-    poly1305_block_copy(st->buffer + st->leftover, m, want);
+    OPENSSL_memcpy(st->buffer + st->leftover, m, want);
     bytes -= want;
     m += want;
     st->leftover += want;
@@ -757,7 +706,7 @@
   }
 
   if (bytes) {
-    poly1305_block_copy(st->buffer + st->leftover, m, bytes);
+    OPENSSL_memcpy(st->buffer + st->leftover, m, bytes);
     st->leftover += bytes;
   }
 }
@@ -833,7 +782,7 @@
   }
 
   m[leftover++] = 1;
-  poly1305_block_zero(m + leftover, 16 - leftover);
+  OPENSSL_memset(m + leftover, 0, 16 - leftover);
   leftover = 16;
 
   t0 = U8TO64_LE(m + 0);