Use a mix of bsaes and vpaes for CTR on NEON. tl;dr: AES is now constant-time on 32-bit ARM with NEON. Combined with all the past work, we now have constant-time AES and GHASH on ARM and x86 chips, 32-bit and 64-bit, provided NEON (required by Chrome on Android, aside from https://crbug.com/341598) or SSSE3 (almost all Chrome on Windows users) is available! CTR-like bsaes modes are harder to resolve than CBC decryption. They use both bulk (ctr128_f) and one-off (block128_f) operations. We currently use ctr128_f of bsaes and block128_f of aes_nohw (not constant-time), which hits 22.0 MB/s on my test chip. Implement a vpaes/bsaes hybrid to get the best of both worlds. The key is kept in vpaes form and, when the input is large enough, we convert the key to bsaes on-demand. This retains bsaes performance, but with no variable-time gaps. Alternatives considered: - Convert to bsaes form immediately and only use bsaes. This makes the one-off block128_f calls very expensive. One 8-block batch of bsaes_ctr32_encrypt_blocks costs as much as 5.76 vpaes_encrypt calls. - Do the above, but fold the one-off calls into bsaes batches because GCM is parallelizable. This is a mess with the current internal structure and doesn't apply to, e.g., CCM. - Drop bsaes in favor of vpaes. However, even with vpaes_ctr32_encrypt_blocks, vpaes is 15.5 MB/s. The hybrid is a 40% win on an important platform. - Try to narrow the gap, as we did for x86_64, with a "2x" optimization. I attempted this here but the register pressure was tricky. (x86_64 was already tight and NEON can't address memory in vtbl.) If I ignored this (gives wrong answer), the gap was still 20-25%. Perf here is slower overall (20 MB/s for old ARM vs 120-140 MB/s for old x86_64), so that gap is scarier. I retained vpaes_ctr32_encrypt_blocks because it's fairly compact (only 84 bytes assembled), though it's less important in the bsaes hybrid. 
Cortex-A53 (Raspberry Pi 3 Model B+) Before: Did 267000 AES-128-GCM (16 bytes) seal operations in 2004871us (133175.7 ops/sec): 2.1 MB/s Did 135000 AES-128-GCM (256 bytes) seal operations in 2013825us (67036.6 ops/sec): 17.2 MB/s Did 31000 AES-128-GCM (1350 bytes) seal operations in 2059039us (15055.6 ops/sec): 20.3 MB/s Did 5565 AES-128-GCM (8192 bytes) seal operations in 2073607us (2683.7 ops/sec): 22.0 MB/s Did 2709 AES-128-GCM (16384 bytes) seal operations in 2020264us (1340.9 ops/sec): 22.0 MB/s Did 209000 AES-256-GCM (16 bytes) seal operations in 2005654us (104205.4 ops/sec): 1.7 MB/s Did 109000 AES-256-GCM (256 bytes) seal operations in 2011293us (54194.0 ops/sec): 13.9 MB/s Did 25000 AES-256-GCM (1350 bytes) seal operations in 2082385us (12005.5 ops/sec): 16.2 MB/s Did 4452 AES-256-GCM (8192 bytes) seal operations in 2080729us (2139.6 ops/sec): 17.5 MB/s Did 2226 AES-256-GCM (16384 bytes) seal operations in 2079819us (1070.3 ops/sec): 17.5 MB/s After: Did 542000 AES-128-GCM (16 bytes) seal operations in 2003408us (270539.0 ops/sec): 4.3 MB/s [+104.8%] Did 124000 AES-128-GCM (256 bytes) seal operations in 2012579us (61612.5 ops/sec): 15.8 MB/s [-8.1%] Did 30000 AES-128-GCM (1350 bytes) seal operations in 2020636us (14846.8 ops/sec): 20.0 MB/s [-1.5%] Did 5502 AES-128-GCM (8192 bytes) seal operations in 2068807us (2659.5 ops/sec): 21.8 MB/s [-0.9%] Did 2772 AES-128-GCM (16384 bytes) seal operations in 2085176us (1329.4 ops/sec): 21.8 MB/s [-0.9%] Did 459000 AES-256-GCM (16 bytes) seal operations in 2003587us (229089.1 ops/sec): 3.7 MB/s [+117.6%] Did 100000 AES-256-GCM (256 bytes) seal operations in 2018311us (49546.4 ops/sec): 12.7 MB/s [-8.6%] Did 24000 AES-256-GCM (1350 bytes) seal operations in 2026975us (11840.3 ops/sec): 16.0 MB/s [-1.2%] Did 4410 AES-256-GCM (8192 bytes) seal operations in 2079581us (2120.6 ops/sec): 17.4 MB/s [-0.6%] Did 2226 AES-256-GCM (16384 bytes) seal operations in 2099318us (1060.3 ops/sec): 17.4 MB/s [-0.6%] Bug: 256 Change-Id: 
Ib74ab7e63974d3ddae8ce5fc35c9b44e73dce305 Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/37429 Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/aes/asm/vpaes-armv7.pl b/crypto/fipsmodule/aes/asm/vpaes-armv7.pl index d147c96..f9fadda 100644 --- a/crypto/fipsmodule/aes/asm/vpaes-armv7.pl +++ b/crypto/fipsmodule/aes/asm/vpaes-armv7.pl
@@ -1281,6 +1281,65 @@ ___ } +{ +# Register-passed parameters. +my ($inp, $out, $len, $key) = map("r$_", 0..3); +# Temporaries. _vpaes_encrypt_core already uses r8..r11, so overlap $ivec and +# $tmp. $ctr is r7 because it must be preserved across calls. +my ($ctr, $ivec, $tmp) = map("r$_", 7..9); + +# void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len, +# const AES_KEY *key, const uint8_t ivec[16]); +$code .= <<___; +.globl vpaes_ctr32_encrypt_blocks +.type vpaes_ctr32_encrypt_blocks,%function +.align 4 +vpaes_ctr32_encrypt_blocks: + mov ip, sp + stmdb sp!, {r7-r11, lr} + @ This function uses q4-q7 (d8-d15), which are callee-saved. + vstmdb sp!, {d8-d15} + + cmp $len, #0 + @ $ivec is passed on the stack. + ldr $ivec, [ip] + beq .Lctr32_done + + @ _vpaes_encrypt_core expects the key in r2, so swap $len and $key. + mov $tmp, $key + mov $key, $len + mov $len, $tmp +___ +my ($len, $key) = ($key, $len); +$code .= <<___; + + @ Load the IV and counter portion. + ldr $ctr, [$ivec, #12] + vld1.8 {q7}, [$ivec] + + bl _vpaes_preheat + rev $ctr, $ctr @ The counter is big-endian. + +.Lctr32_loop: + vmov q0, q7 + vld1.8 {q6}, [$inp]! @ Load input ahead of time + bl _vpaes_encrypt_core + veor q0, q0, q6 @ XOR input and result + vst1.8 {q0}, [$out]! + subs $len, $len, #1 + @ Update the counter. + add $ctr, $ctr, #1 + rev $tmp, $ctr + vmov.32 q7#hi[1], $tmp + bne .Lctr32_loop + +.Lctr32_done: + vldmia sp!, {d8-d15} + ldmia sp!, {r7-r11, pc} @ return +.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks +___ +} + foreach (split("\n",$code)) { s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; print $_,"\n";
diff --git a/crypto/fipsmodule/aes/internal.h b/crypto/fipsmodule/aes/internal.h index 8471a80..99d509a 100644 --- a/crypto/fipsmodule/aes/internal.h +++ b/crypto/fipsmodule/aes/internal.h
@@ -51,6 +51,7 @@ #if defined(OPENSSL_ARM) #define BSAES #define VPAES +#define VPAES_CTR32 OPENSSL_INLINE int bsaes_capable(void) { return CRYPTO_is_NEON_capable(); } OPENSSL_INLINE int vpaes_capable(void) { return CRYPTO_is_NEON_capable(); } #endif
diff --git a/crypto/fipsmodule/cipher/e_aes.c b/crypto/fipsmodule/cipher/e_aes.c index 685d2db..72910ed 100644 --- a/crypto/fipsmodule/cipher/e_aes.c +++ b/crypto/fipsmodule/cipher/e_aes.c
@@ -68,6 +68,48 @@ OPENSSL_MSVC_PRAGMA(warning(push)) OPENSSL_MSVC_PRAGMA(warning(disable: 4702)) // Unreachable code. +#if defined(BSAES) +static void vpaes_ctr32_encrypt_blocks_with_bsaes(const uint8_t *in, + uint8_t *out, size_t blocks, + const AES_KEY *key, + const uint8_t ivec[16]) { + // |bsaes_ctr32_encrypt_blocks| is faster than |vpaes_ctr32_encrypt_blocks|, + // but it takes at least one full 8-block batch to amortize the conversion. + if (blocks < 8) { + vpaes_ctr32_encrypt_blocks(in, out, blocks, key, ivec); + return; + } + + size_t bsaes_blocks = blocks; + if (bsaes_blocks % 8 < 6) { + // |bsaes_ctr32_encrypt_blocks| internally works in 8-block batches. If the + // final batch is too small (under six blocks), it is faster to loop over + // |vpaes_encrypt|. Round |bsaes_blocks| down to a multiple of 8. + bsaes_blocks -= bsaes_blocks % 8; + } + + AES_KEY bsaes; + vpaes_encrypt_key_to_bsaes(&bsaes, key); + bsaes_ctr32_encrypt_blocks(in, out, bsaes_blocks, &bsaes, ivec); + OPENSSL_cleanse(&bsaes, sizeof(bsaes)); + + in += 16 * bsaes_blocks; + out += 16 * bsaes_blocks; + blocks -= bsaes_blocks; + + union { + uint32_t u32[4]; + uint8_t u8[16]; + } new_ivec; + memcpy(new_ivec.u8, ivec, 16); + uint32_t ctr = CRYPTO_bswap4(new_ivec.u32[3]) + bsaes_blocks; + new_ivec.u32[3] = CRYPTO_bswap4(ctr); + + // Finish any remaining blocks with |vpaes_ctr32_encrypt_blocks|. 
+ vpaes_ctr32_encrypt_blocks(in, out, blocks, key, new_ivec.u8); +} +#endif // BSAES + typedef struct { union { double align; @@ -110,6 +152,7 @@ dat->stream.cbc = aes_hw_cbc_encrypt; } } else if (bsaes_capable() && mode == EVP_CIPH_CBC_MODE) { + assert(vpaes_capable()); ret = vpaes_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks); if (ret == 0) { vpaes_decrypt_key_to_bsaes(&dat->ks.ks, &dat->ks.ks); @@ -145,11 +188,6 @@ } else if (mode == EVP_CIPH_CTR_MODE) { dat->stream.ctr = aes_hw_ctr32_encrypt_blocks; } - } else if (bsaes_capable() && mode == EVP_CIPH_CTR_MODE) { - ret = aes_nohw_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks); - // If |dat->stream.ctr| is provided, |dat->block| is never used. - dat->block = NULL; - dat->stream.ctr = bsaes_ctr32_encrypt_blocks; } else if (vpaes_capable()) { ret = vpaes_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks); dat->block = vpaes_encrypt; @@ -159,11 +197,14 @@ dat->stream.cbc = vpaes_cbc_encrypt; } #endif -#if defined(VPAES_CTR32) if (mode == EVP_CIPH_CTR_MODE) { +#if defined(BSAES) + assert(bsaes_capable()); + dat->stream.ctr = vpaes_ctr32_encrypt_blocks_with_bsaes; +#elif defined(VPAES_CTR32) dat->stream.ctr = vpaes_ctr32_encrypt_blocks; - } #endif + } } else { ret = aes_nohw_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks); dat->block = aes_nohw_encrypt; @@ -252,17 +293,6 @@ return aes_hw_ctr32_encrypt_blocks; } - if (bsaes_capable()) { - aes_nohw_set_encrypt_key(key, key_bytes * 8, aes_key); - if (gcm_key != NULL) { - CRYPTO_gcm128_init_key(gcm_key, aes_key, aes_nohw_encrypt, 0); - } - if (out_block) { - *out_block = aes_nohw_encrypt; - } - return bsaes_ctr32_encrypt_blocks; - } - if (vpaes_capable()) { vpaes_set_encrypt_key(key, key_bytes * 8, aes_key); if (out_block) { @@ -271,7 +301,10 @@ if (gcm_key != NULL) { CRYPTO_gcm128_init_key(gcm_key, aes_key, vpaes_encrypt, 0); } -#if defined(VPAES_CTR32) +#if defined(BSAES) + assert(bsaes_capable()); + return vpaes_ctr32_encrypt_blocks_with_bsaes; +#elif 
defined(VPAES_CTR32) return vpaes_ctr32_encrypt_blocks; #else return NULL;