Remove custom memcpy and memset from poly1305_vec.
This avoids upsetting the C compiler. UBSan is offended by the alignment
violations in those functions. The business with offset is also
undefined behavior (pointer arithmetic is supposed to stay within a
single object).
There is a small performance cost, however:
Before:
Did 6636000 ChaCha20-Poly1305 (16 bytes) seal operations in 5000475us (1327073.9 ops/sec): 21.2 MB/s
Did 832000 ChaCha20-Poly1305 (1350 bytes) seal operations in 5003481us (166284.2 ops/sec): 224.5 MB/s
Did 155000 ChaCha20-Poly1305 (8192 bytes) seal operations in 5026933us (30833.9 ops/sec): 252.6 MB/s
After:
Did 6508000 ChaCha20-Poly1305 (16 bytes) seal operations in 5000160us (1301558.4 ops/sec): 20.8 MB/s
Did 831000 ChaCha20-Poly1305 (1350 bytes) seal operations in 5002865us (166104.8 ops/sec): 224.2 MB/s
Did 155000 ChaCha20-Poly1305 (8192 bytes) seal operations in 5013204us (30918.4 ops/sec): 253.3 MB/s
(Tested with the no-asm build, which disables the custom stitched-mode
assembly and ends up using this implementation.)
Change-Id: I76d74183f1e04ad3726463a8871ee64be04ce674
Reviewed-on: https://boringssl-review.googlesource.com/22784
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/poly1305/poly1305_vec.c b/crypto/poly1305/poly1305_vec.c
index 80eaa36..480d9e5 100644
--- a/crypto/poly1305/poly1305_vec.c
+++ b/crypto/poly1305/poly1305_vec.c
@@ -85,57 +85,6 @@
return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
}
-// copy 0-63 bytes
-static inline void
-poly1305_block_copy(uint8_t *dst, const uint8_t *src, size_t bytes) {
- size_t offset = src - dst;
- if (bytes & 32) {
- _mm_storeu_si128((xmmi *)(dst + 0),
- _mm_loadu_si128((const xmmi *)(dst + offset + 0)));
- _mm_storeu_si128((xmmi *)(dst + 16),
- _mm_loadu_si128((const xmmi *)(dst + offset + 16)));
- dst += 32;
- }
- if (bytes & 16) {
- _mm_storeu_si128((xmmi *)dst, _mm_loadu_si128((const xmmi *)(dst + offset)));
- dst += 16;
- }
- if (bytes & 8) {
- *(uint64_t *)dst = *(const uint64_t *)(dst + offset);
- dst += 8;
- }
- if (bytes & 4) {
- *(uint32_t *)dst = *(const uint32_t *)(dst + offset);
- dst += 4;
- }
- if (bytes & 2) {
- *(uint16_t *)dst = *(uint16_t *)(dst + offset);
- dst += 2;
- }
- if (bytes & 1) {
- *(uint8_t *)dst = *(uint8_t *)(dst + offset);
- }
-}
-
-// zero 0-15 bytes
-static inline void poly1305_block_zero(uint8_t *dst, size_t bytes) {
- if (bytes & 8) {
- *(uint64_t *)dst = 0;
- dst += 8;
- }
- if (bytes & 4) {
- *(uint32_t *)dst = 0;
- dst += 4;
- }
- if (bytes & 2) {
- *(uint16_t *)dst = 0;
- dst += 2;
- }
- if (bytes & 1) {
- *(uint8_t *)dst = 0;
- }
-}
-
static inline size_t poly1305_min(size_t a, size_t b) {
return (a < b) ? a : b;
}
@@ -721,7 +670,7 @@
bytes -= 32;
} else {
want = poly1305_min(32 - st->leftover, bytes);
- poly1305_block_copy(st->buffer + st->leftover, m, want);
+ OPENSSL_memcpy(st->buffer + st->leftover, m, want);
bytes -= want;
m += want;
st->leftover += want;
@@ -737,7 +686,7 @@
// handle leftover
if (st->leftover) {
want = poly1305_min(64 - st->leftover, bytes);
- poly1305_block_copy(st->buffer + st->leftover, m, want);
+ OPENSSL_memcpy(st->buffer + st->leftover, m, want);
bytes -= want;
m += want;
st->leftover += want;
@@ -757,7 +706,7 @@
}
if (bytes) {
- poly1305_block_copy(st->buffer + st->leftover, m, bytes);
+ OPENSSL_memcpy(st->buffer + st->leftover, m, bytes);
st->leftover += bytes;
}
}
@@ -833,7 +782,7 @@
}
m[leftover++] = 1;
- poly1305_block_zero(m + leftover, 16 - leftover);
+ OPENSSL_memset(m + leftover, 0, 16 - leftover);
leftover = 16;
t0 = U8TO64_LE(m + 0);