Enable vpaes for aarch64, with CTR optimizations.
This patches vpaes-armv8.pl to add vpaes_ctr32_encrypt_blocks, since CTR
mode is by far the most important mode these days. The new function uses
_vpaes_encrypt_2x to process two blocks at a time, which gives a
considerable speed boost. Also exclude the vpaes_ecb_* functions, as
they are unused.
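For reference, a minimal C model of the new function's contract, as
documented in the patch below (illustrative only; the helper name
ctr32_encrypt_blocks_ref is made up for the sketch). len counts 16-byte
blocks rather than bytes, and only the low 32 bits of the counter,
stored big-endian in ivec[12..15], are incremented; ivec itself is left
unmodified:

  #include <stdint.h>
  #include <string.h>
  #include <openssl/aes.h>

  // Reference model for vpaes_ctr32_encrypt_blocks: |blocks| is a
  // block count, not a byte count, and |ivec| is not written to.
  static void ctr32_encrypt_blocks_ref(const uint8_t *in, uint8_t *out,
                                       size_t blocks, const AES_KEY *key,
                                       const uint8_t ivec[16]) {
    uint8_t block[16], keystream[16];
    memcpy(block, ivec, 16);
    uint32_t ctr = (uint32_t)block[12] << 24 | (uint32_t)block[13] << 16 |
                   (uint32_t)block[14] << 8 | (uint32_t)block[15];
    for (size_t i = 0; i < blocks; i++) {
      AES_encrypt(block, keystream, key);
      for (int j = 0; j < 16; j++) {
        out[16 * i + j] = in[16 * i + j] ^ keystream[j];
      }
      ctr++;  // Only the low 32 bits of the counter advance.
      block[12] = (uint8_t)(ctr >> 24);
      block[13] = (uint8_t)(ctr >> 16);
      block[14] = (uint8_t)(ctr >> 8);
      block[15] = (uint8_t)ctr;
    }
  }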
For iOS, this change is a complete no-op: iOS ARMv8 always has the
crypto extensions, and we already statically drop all other AES
implementations there.
Android ARMv8 is *not* required to have crypto extensions, but every
ARMv8 device I've seen has them. For those devices, this change is a
performance no-op and a size win: vpaes appears to be about 5.6KiB
smaller than aes_nohw's lookup tables. And since ARMv8 always makes SIMD
(NEON) available, we can statically drop aes_nohw.
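A condensed sketch of why aes_nohw falls away (an assumed shape; the
real logic is in crypto/fipsmodule/aes/internal.h and e_aes.c, shown in
the diff below). On AArch64, CRYPTO_is_NEON_capable() is constant-true,
so the fallback branch is provably dead and the compiler discards it:

  #include <openssl/cpu.h>

  // Mirrors the AArch64 definition added in internal.h below.
  static int vpaes_capable_sketch(void) { return CRYPTO_is_NEON_capable(); }

  const char *pick_aes_impl(int hwaes_capable) {
    if (hwaes_capable) {
      return "hwaes";       // hardware AES extensions present
    } else if (vpaes_capable_sketch()) {
      return "vpaes";       // always taken on AArch64 without hwaes
    }
    return "aes_nohw";      // statically unreachable on AArch64
  }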
In theory, however, a crypto-less Android ARMv8 chip is possible. Today
such chips get a variable-time AES. This CL fixes that, but the
performance story is complex.
The Raspberry Pi 3 is not Android but has a Cortex-A53 chip
without crypto extensions. (But the official images are 32-bit, so even
this is slightly artificial...) There, vpaes is a performance win.
Raspberry Pi 3, Model B+, Cortex-A53
Before:
Did 265000 AES-128-GCM (16 bytes) seal operations in 1003312us (264125.2 ops/sec): 4.2 MB/s
Did 44000 AES-128-GCM (256 bytes) seal operations in 1002141us (43906.0 ops/sec): 11.2 MB/s
Did 9394 AES-128-GCM (1350 bytes) seal operations in 1032104us (9101.8 ops/sec): 12.3 MB/s
Did 1562 AES-128-GCM (8192 bytes) seal operations in 1008982us (1548.1 ops/sec): 12.7 MB/s
After:
Did 277000 AES-128-GCM (16 bytes) seal operations in 1001884us (276479.1 ops/sec): 4.4 MB/s
Did 52000 AES-128-GCM (256 bytes) seal operations in 1001480us (51923.2 ops/sec): 13.3 MB/s
Did 11000 AES-128-GCM (1350 bytes) seal operations in 1007979us (10912.9 ops/sec): 14.7 MB/s
Did 2013 AES-128-GCM (8192 bytes) seal operations in 1085545us (1854.4 ops/sec): 15.2 MB/s
The Pixel 3 has a Cortex-A75 with crypto extensions, so it would never
run this code. However, artificially ignoring the extensions gives
another data point. (ARM documentation[*] suggests they are still
optional on a Cortex-A75.) Sadly, vpaes no longer wins on perf over
aes_nohw there. But it is constant-time:
Pixel 3, AES/PMULL extensions ignored, Cortex-A75:
Before:
Did 2102000 AES-128-GCM (16 bytes) seal operations in 1000378us (2101205.7 ops/sec): 33.6 MB/s
Did 358000 AES-128-GCM (256 bytes) seal operations in 1002658us (357051.0 ops/sec): 91.4 MB/s
Did 75000 AES-128-GCM (1350 bytes) seal operations in 1012830us (74049.9 ops/sec): 100.0 MB/s
Did 13000 AES-128-GCM (8192 bytes) seal operations in 1036524us (12541.9 ops/sec): 102.7 MB/s
After:
Did 1453000 AES-128-GCM (16 bytes) seal operations in 1000213us (1452690.6 ops/sec): 23.2 MB/s
Did 285000 AES-128-GCM (256 bytes) seal operations in 1002227us (284366.7 ops/sec): 72.8 MB/s
Did 60000 AES-128-GCM (1350 bytes) seal operations in 1016106us (59049.0 ops/sec): 79.7 MB/s
Did 11000 AES-128-GCM (8192 bytes) seal operations in 1094184us (10053.2 ops/sec): 82.4 MB/s
Note the numbers above were taken with PMULL off, so the slow GHASH
dampens the regression. If we instead pair aes_nohw and vpaes with PMULL
on, the 20% perf hit becomes a 31% hit. The PMULL-less configuration is
more likely to represent a real chip.
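A rough model of that dampening (an assumption for illustration, not a
separate measurement): seal time splits into an AES part and a GHASH
part, so scaling only AES by a factor r dilutes the visible regression
in proportion to AES's share of the total. With PMULL off, t_ghash is
large and the same r shows up as a smaller overall hit:

  // Overall slowdown when only the AES portion slows by factor r.
  double overall_slowdown(double t_aes, double t_ghash, double r) {
    return (r * t_aes + t_ghash) / (t_aes + t_ghash);
  }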
This is consistent with the note in upstream's comment, though it is
unclear whether 20% is the right order of magnitude: "these results are
worse than scalar compiler-generated code, but it's constant-time and
therefore preferred".
[*] http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.100458_0301_00_en/lau1442495529696.html
Bug: 246
Change-Id: If1dc87f5131fce742052498295476fbae4628dbf
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/35026
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt
index 04b2ffb..09d210b 100644
--- a/crypto/fipsmodule/CMakeLists.txt
+++ b/crypto/fipsmodule/CMakeLists.txt
@@ -69,6 +69,7 @@
sha1-armv8.${ASM_EXT}
sha256-armv8.${ASM_EXT}
sha512-armv8.${ASM_EXT}
+ vpaes-armv8.${ASM_EXT}
)
endif()
@@ -120,6 +121,7 @@
perlasm(sha512-armv4.${ASM_EXT} sha/asm/sha512-armv4.pl)
perlasm(sha512-armv8.${ASM_EXT} sha/asm/sha512-armv8.pl)
perlasm(sha512-x86_64.${ASM_EXT} sha/asm/sha512-x86_64.pl)
+perlasm(vpaes-armv8.${ASM_EXT} aes/asm/vpaes-armv8.pl)
perlasm(vpaes-x86_64.${ASM_EXT} aes/asm/vpaes-x86_64.pl)
perlasm(vpaes-x86.${ASM_EXT} aes/asm/vpaes-x86.pl)
perlasm(x86_64-mont5.${ASM_EXT} bn/asm/x86_64-mont5.pl)
diff --git a/crypto/fipsmodule/aes/aes_test.cc b/crypto/fipsmodule/aes/aes_test.cc
index a0c9411..2222b63 100644
--- a/crypto/fipsmodule/aes/aes_test.cc
+++ b/crypto/fipsmodule/aes/aes_test.cc
@@ -250,6 +250,9 @@
SCOPED_TRACE(blocks);
CHECK_ABI(vpaes_cbc_encrypt, buf, buf, AES_BLOCK_SIZE * blocks, &key,
block, AES_ENCRYPT);
+#if defined(VPAES_CTR32)
+ CHECK_ABI(vpaes_ctr32_encrypt_blocks, buf, buf, blocks, &key, block);
+#endif
}
CHECK_ABI(vpaes_set_decrypt_key, kKey, bits, &key);
diff --git a/crypto/fipsmodule/aes/asm/vpaes-armv8.pl b/crypto/fipsmodule/aes/asm/vpaes-armv8.pl
index 5131e13..49eaf0d 100755
--- a/crypto/fipsmodule/aes/asm/vpaes-armv8.pl
+++ b/crypto/fipsmodule/aes/asm/vpaes-armv8.pl
@@ -42,7 +42,7 @@
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
-( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
@@ -1171,7 +1171,8 @@
ret
.size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
___
-if (1) {
+# We omit vpaes_ecb_* in BoringSSL. They are unused.
+if (0) {
$code.=<<___;
.globl vpaes_ecb_encrypt
.type vpaes_ecb_encrypt,%function
@@ -1253,7 +1254,89 @@
ret
.size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
___
-} }
+}
+
+my ($ctr, $ctr_tmp) = ("w6", "w7");
+
+# void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
+# const AES_KEY *key, const uint8_t ivec[16]);
+$code.=<<___;
+.globl vpaes_ctr32_encrypt_blocks
+.type vpaes_ctr32_encrypt_blocks,%function
+.align 4
+vpaes_ctr32_encrypt_blocks:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+ stp d10,d11,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
+
+ cbz $len, .Lctr32_done
+
+ // Note, unlike the other functions, $len here is measured in blocks,
+ // not bytes.
+ mov x17, $len
+ mov x2, $key
+
+ // Load the IV and counter portion.
+ ldr $ctr, [$ivec, #12]
+ ld1 {v7.16b}, [$ivec]
+
+ bl _vpaes_encrypt_preheat
+ tst x17, #1
+ rev $ctr, $ctr // The counter is big-endian.
+ b.eq .Lctr32_prep_loop
+
+ // Handle one block so the remaining block count is even for
+ // _vpaes_encrypt_2x.
+ ld1 {v6.16b}, [$inp], #16 // Load input ahead of time
+ bl _vpaes_encrypt_core
+ eor v0.16b, v0.16b, v6.16b // XOR input and result
+ st1 {v0.16b}, [$out], #16
+ subs x17, x17, #1
+ // Update the counter.
+ add $ctr, $ctr, #1
+ rev $ctr_tmp, $ctr
+ mov v7.s[3], $ctr_tmp
+ b.ls .Lctr32_done
+
+.Lctr32_prep_loop:
+ // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
+ // uses v14 and v15.
+ mov v15.16b, v7.16b
+ mov v14.16b, v7.16b
+ add $ctr, $ctr, #1
+ rev $ctr_tmp, $ctr
+ mov v15.s[3], $ctr_tmp
+
+.Lctr32_loop:
+ ld1 {v6.16b,v7.16b}, [$inp], #32 // Load input ahead of time
+ bl _vpaes_encrypt_2x
+ eor v0.16b, v0.16b, v6.16b // XOR input and result
+ eor v1.16b, v1.16b, v7.16b // XOR input and result (#2)
+ st1 {v0.16b,v1.16b}, [$out], #32
+ subs x17, x17, #2
+ // Update the counter.
+ add $ctr_tmp, $ctr, #1
+ add $ctr, $ctr, #2
+ rev $ctr_tmp, $ctr_tmp
+ mov v14.s[3], $ctr_tmp
+ rev $ctr_tmp, $ctr
+ mov v15.s[3], $ctr_tmp
+ b.hi .Lctr32_loop
+
+.Lctr32_done:
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ ret
+.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
+___
+}
+
print $code;
close STDOUT;
diff --git a/crypto/fipsmodule/aes/internal.h b/crypto/fipsmodule/aes/internal.h
index 0df30d9..a05abcb 100644
--- a/crypto/fipsmodule/aes/internal.h
+++ b/crypto/fipsmodule/aes/internal.h
@@ -35,13 +35,13 @@
}
#define VPAES
-OPENSSL_INLINE char vpaes_capable(void) {
+OPENSSL_INLINE int vpaes_capable(void) {
return (OPENSSL_ia32cap_get()[1] & (1 << (41 - 32))) != 0;
}
#if defined(OPENSSL_X86_64)
#define BSAES
-OPENSSL_INLINE char bsaes_capable(void) { return vpaes_capable(); }
+OPENSSL_INLINE int bsaes_capable(void) { return vpaes_capable(); }
#endif // X86_64
#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
@@ -51,7 +51,13 @@
#if defined(OPENSSL_ARM)
#define BSAES
-OPENSSL_INLINE char bsaes_capable(void) { return CRYPTO_is_NEON_capable(); }
+OPENSSL_INLINE int bsaes_capable(void) { return CRYPTO_is_NEON_capable(); }
+#endif
+
+#if defined(OPENSSL_AARCH64)
+#define VPAES
+#define VPAES_CTR32
+OPENSSL_INLINE int vpaes_capable(void) { return CRYPTO_is_NEON_capable(); }
#endif
#elif defined(OPENSSL_PPC64LE)
@@ -162,6 +168,10 @@
void vpaes_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
const AES_KEY *key, uint8_t *ivec, int enc);
+#if defined(VPAES_CTR32)
+void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
+ const AES_KEY *key, const uint8_t ivec[16]);
+#endif
#else
OPENSSL_INLINE char vpaes_capable(void) { return 0; }
diff --git a/crypto/fipsmodule/cipher/e_aes.c b/crypto/fipsmodule/cipher/e_aes.c
index 460deed..51a1fb1 100644
--- a/crypto/fipsmodule/cipher/e_aes.c
+++ b/crypto/fipsmodule/cipher/e_aes.c
@@ -143,7 +143,15 @@
} else if (vpaes_capable()) {
ret = vpaes_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = vpaes_encrypt;
- dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ? vpaes_cbc_encrypt : NULL;
+ dat->stream.cbc = NULL;
+ if (mode == EVP_CIPH_CBC_MODE) {
+ dat->stream.cbc = vpaes_cbc_encrypt;
+ }
+#if defined(VPAES_CTR32)
+ if (mode == EVP_CIPH_CTR_MODE) {
+ dat->stream.ctr = vpaes_ctr32_encrypt_blocks;
+ }
+#endif
} else {
ret = aes_nohw_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = aes_nohw_encrypt;
@@ -253,7 +261,11 @@
if (gcm_key != NULL) {
CRYPTO_gcm128_init_key(gcm_key, aes_key, vpaes_encrypt, 0);
}
+#if defined(VPAES_CTR32)
+ return vpaes_ctr32_encrypt_blocks;
+#else
return NULL;
+#endif
}
aes_nohw_set_encrypt_key(key, key_bytes * 8, aes_key);