Adapt gcm_*_neon to aarch64.
This makes AES-GCM always constant-time on aarch64 (provided assembly is
enabled). Unlike vpaes, this does come at a binary size penalty of 1K
compared to the gcm_*_4bit version.
ABI testing is already covered by GCMTest.ABI (GHASH_ASM_ARM covers both
OPENSSL_ARM and OPENSSL_AARCH64).
Cortex-A53 (Raspberry Pi 3 Model B+)
Before:
Did 274000 AES-128-GCM (16 bytes) seal operations in 1003461us (273055.0 ops/sec): 4.4 MB/s
Did 53000 AES-128-GCM (256 bytes) seal operations in 1007689us (52595.6 ops/sec): 13.5 MB/s
Did 12000 AES-128-GCM (1350 bytes) seal operations in 1075908us (11153.4 ops/sec): 15.1 MB/s
Did 2068 AES-128-GCM (8192 bytes) seal operations in 1089037us (1898.9 ops/sec): 15.6 MB/s
After:
Did 298000 AES-128-GCM (16 bytes) seal operations in 1002917us (297133.3 ops/sec): 4.8 MB/s
Did 64000 AES-128-GCM (256 bytes) seal operations in 1001124us (63928.1 ops/sec): 16.4 MB/s
Did 14000 AES-128-GCM (1350 bytes) seal operations in 1015477us (13786.6 ops/sec): 18.6 MB/s
Did 2497 AES-128-GCM (8192 bytes) seal operations in 1057951us (2360.2 ops/sec): 19.3 MB/s
Bug: 265
Change-Id: I251bf0f2eae0578580bb14192755e5d8ff64cd14
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/35285
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt
index 09d210b..fbf25ac88 100644
--- a/crypto/fipsmodule/CMakeLists.txt
+++ b/crypto/fipsmodule/CMakeLists.txt
@@ -65,6 +65,7 @@
aesv8-armx.${ASM_EXT}
armv8-mont.${ASM_EXT}
+ ghash-neon-armv8.${ASM_EXT}
ghashv8-armx.${ASM_EXT}
sha1-armv8.${ASM_EXT}
sha256-armv8.${ASM_EXT}
@@ -99,6 +100,7 @@
perlasm(ghash-armv4.${ASM_EXT} modes/asm/ghash-armv4.pl)
perlasm(ghashp8-ppc.${ASM_EXT} modes/asm/ghashp8-ppc.pl)
perlasm(ghashv8-armx.${ASM_EXT} modes/asm/ghashv8-armx.pl)
+perlasm(ghash-neon-armv8.${ASM_EXT} modes/asm/ghash-neon-armv8.pl)
perlasm(ghash-ssse3-x86_64.${ASM_EXT} modes/asm/ghash-ssse3-x86_64.pl)
perlasm(ghash-ssse3-x86.${ASM_EXT} modes/asm/ghash-ssse3-x86.pl)
perlasm(ghash-x86_64.${ASM_EXT} modes/asm/ghash-x86_64.pl)
diff --git a/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl b/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl
new file mode 100644
index 0000000..972be41
--- /dev/null
+++ b/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl
@@ -0,0 +1,287 @@
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# This file was adapted to AArch64 from the 32-bit version in ghash-armv4.pl. It
+# implements the multiplication algorithm described in:
+#
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
+# Polynomial Multiplication on ARM Processors using the NEON Engine.
+#
+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
+#
+# The main distinction to keep in mind between 32-bit NEON and AArch64 SIMD is
+# AArch64 cannot compute over the upper halves of SIMD registers. In 32-bit
+# NEON, the low and high halves of the 128-bit register q0 are accessible as
+# 64-bit registers d0 and d1, respectively. In AArch64, dN is the lower half of
+# vN. Where the 32-bit version would use the upper half, this file must keep
+# halves in separate registers.
+#
+# The other distinction is in syntax. 32-bit NEON embeds lane information in the
+# instruction name, while AArch64 uses suffixes on the registers. For instance,
+# left-shifting 64-bit lanes of a SIMD register in 32-bit would be written:
+#
+# vshl.i64 q0, q0, #1
+#
+# in 64-bit, it would be written:
+#
+# shl v0.2d, v0.2d, #1
+#
+# See Programmer's Guide for ARMv8-A, section 7 for details.
+# http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf
+#
+# Finally, note the 8-bit and 64-bit polynomial multipliers in AArch64 differ
+# only by suffix. pmull vR.8h, vA.8b, vB.8b multiplies eight 8-bit polynomials
+# and is always available. pmull vR.1q, vA.1d, vB.1d multiplies a 64-bit
+# polynomial and is conditioned on the PMULL extension. This file emulates the
+# latter with the former.
+
+use strict;
+
+my $flavour = shift; # first argument: the flavour, or the output file if it looks like a file name
+my $output;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } # skip ahead to the first file-like argument
+
+if ($flavour && $flavour ne "void") { # a real flavour: pipe the output through arm-xlate.pl
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; # capture the directory part of this script's path
+ my $dir = $1;
+ my $xlate;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+ # Quote both paths so directories containing spaces survive the shell, and fail loudly if the pipe cannot be set up.
+ open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
+} else {
+ open STDOUT,">$output" or die "can't open $output: $!"; # no translation: write the raw text directly
+}
+
+my ($Xi, $Htbl, $inp, $len) = map { "x$_" } 0 .. 3; # the four arguments, in x0-x3
+my ($Xl, $Xm, $Xh, $INlo, $INhi) = map { "v$_" } 0 .. 4; # the three products and the two input halves
+my ($Hlo, $Hhi, $Hhl) = map { "v$_" } 5 .. 7; # H.lo, H.hi, and H.lo ^ H.hi
+# v8-v15 are skipped because d8-d15 are callee-saved. AArch64 has plenty of
+# other SIMD registers to use instead.
+my ($t0, $t1, $t2, $t3) = map { "v$_" } 16 .. 19; # scratch registers for clmul64x64
+my ($t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h) = map { "v$_" } 20 .. 23; # paired halves of the scratch registers
+my ($k48_k32, $k16_k0) = map { "v$_" } 24 .. 25; # masks loaded from .Lmasks
+
+my $code = ''; # assembly text accumulated for output
+
+# clmul64x64 appends to $code instructions that emulate pmull $r.1q, $a.1d, $b.1d with 8-bit pmull. $r, $a, and $b
+# must be distinct from $t* and $k*. All $t* registers are clobbered by the emitted code; $k* are only read.
+sub clmul64x64 {
+my ($r, $a, $b) = @_; # register names without a lane suffix; the template appends .8b/.8h/.16b
+$code .= <<___; # interpolated here, printed by the loop at the end of the file
+ ext $t0.8b, $a.8b, $a.8b, #1 // A1
+ pmull $t0.8h, $t0.8b, $b.8b // F = A1*B
+ ext $r.8b, $b.8b, $b.8b, #1 // B1
+ pmull $r.8h, $a.8b, $r.8b // E = A*B1
+ ext $t1.8b, $a.8b, $a.8b, #2 // A2
+ pmull $t1.8h, $t1.8b, $b.8b // H = A2*B
+ ext $t3.8b, $b.8b, $b.8b, #2 // B2
+ pmull $t3.8h, $a.8b, $t3.8b // G = A*B2
+ ext $t2.8b, $a.8b, $a.8b, #3 // A3
+ eor $t0.16b, $t0.16b, $r.16b // L = E + F
+ pmull $t2.8h, $t2.8b, $b.8b // J = A3*B
+ ext $r.8b, $b.8b, $b.8b, #3 // B3
+ eor $t1.16b, $t1.16b, $t3.16b // M = G + H
+ pmull $r.8h, $a.8b, $r.8b // I = A*B3
+
+ // Here we diverge from the 32-bit version. It computes the following
+ // (instructions reordered for clarity):
+ //
+ // veor \$t0#lo, \$t0#lo, \$t0#hi @ t0 = P0 + P1 (L)
+ // vand \$t0#hi, \$t0#hi, \$k48
+ // veor \$t0#lo, \$t0#lo, \$t0#hi
+ //
+ // veor \$t1#lo, \$t1#lo, \$t1#hi @ t1 = P2 + P3 (M)
+ // vand \$t1#hi, \$t1#hi, \$k32
+ // veor \$t1#lo, \$t1#lo, \$t1#hi
+ //
+ // veor \$t2#lo, \$t2#lo, \$t2#hi @ t2 = P4 + P5 (N)
+ // vand \$t2#hi, \$t2#hi, \$k16
+ // veor \$t2#lo, \$t2#lo, \$t2#hi
+ //
+ // veor \$t3#lo, \$t3#lo, \$t3#hi @ t3 = P6 + P7 (K)
+ // vmov.i64 \$t3#hi, #0
+ //
+ // \$kN is a mask with the bottom N bits set. AArch64 cannot compute on
+ // upper halves of SIMD registers, so we must split each half into
+ // separate registers. To compensate, we pair computations up and
+ // parallelize.
+
+ ext $t3.8b, $b.8b, $b.8b, #4 // B4
+ eor $t2.16b, $t2.16b, $r.16b // N = I + J
+ pmull $t3.8h, $a.8b, $t3.8b // K = A*B4
+
+ // This can probably be scheduled more efficiently. For now, we just
+ // pair up independent instructions.
+ zip1 $t0l_t1l.2d, $t0.2d, $t1.2d
+ zip1 $t2l_t3l.2d, $t2.2d, $t3.2d
+ zip2 $t0h_t1h.2d, $t0.2d, $t1.2d
+ zip2 $t2h_t3h.2d, $t2.2d, $t3.2d
+ eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b
+ eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
+ and $t0h_t1h.16b, $t0h_t1h.16b, $k48_k32.16b
+ and $t2h_t3h.16b, $t2h_t3h.16b, $k16_k0.16b
+ eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b
+ eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
+ zip1 $t0.2d, $t0l_t1l.2d, $t0h_t1h.2d
+ zip1 $t2.2d, $t2l_t3l.2d, $t2h_t3h.2d
+ zip2 $t1.2d, $t0l_t1l.2d, $t0h_t1h.2d
+ zip2 $t3.2d, $t2l_t3l.2d, $t2h_t3h.2d
+
+ ext $t0.16b, $t0.16b, $t0.16b, #15 // t0 = t0 << 8
+ ext $t1.16b, $t1.16b, $t1.16b, #14 // t1 = t1 << 16
+ pmull $r.8h, $a.8b, $b.8b // D = A*B
+ ext $t3.16b, $t3.16b, $t3.16b, #12 // t3 = t3 << 32
+ ext $t2.16b, $t2.16b, $t2.16b, #13 // t2 = t2 << 24
+ eor $t0.16b, $t0.16b, $t1.16b
+ eor $t2.16b, $t2.16b, $t3.16b
+ eor $r.16b, $r.16b, $t0.16b
+ eor $r.16b, $r.16b, $t2.16b
+___
+}
+
+$code .= <<___; # .text preamble, gcm_init_neon, gcm_gmult_neon, and the head of gcm_ghash_neon up to the input split
+.text
+
+.global gcm_init_neon
+.type gcm_init_neon,%function
+.align 4
+gcm_init_neon:
+ // This function is adapted from gcm_init_v8. xC2 is t3.
+ ld1 {$t1.2d}, [x1] // load H
+ movi $t3.16b, #0xe1
+ shl $t3.2d, $t3.2d, #57 // 0xc2.0
+ ext $INlo.16b, $t1.16b, $t1.16b, #8
+ ushr $t2.2d, $t3.2d, #63
+ dup $t1.4s, $t1.s[1]
+ ext $t0.16b, $t2.16b, $t3.16b, #8 // t0=0xc2....01
+ ushr $t2.2d, $INlo.2d, #63
+ sshr $t1.4s, $t1.4s, #31 // broadcast carry bit
+ and $t2.16b, $t2.16b, $t0.16b
+ shl $INlo.2d, $INlo.2d, #1
+ ext $t2.16b, $t2.16b, $t2.16b, #8
+ and $t0.16b, $t0.16b, $t1.16b
+ orr $INlo.16b, $INlo.16b, $t2.16b // H<<<=1
+ eor $Hlo.16b, $INlo.16b, $t0.16b // twisted H
+ st1 {$Hlo.2d}, [x0] // store Htable[0]
+ ret
+.size gcm_init_neon,.-gcm_init_neon
+
+.global gcm_gmult_neon
+.type gcm_gmult_neon,%function
+.align 4
+gcm_gmult_neon:
+ ld1 {$INlo.16b}, [$Xi] // load Xi
+ ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H
+ ld1 {$Hhi.1d}, [$Htbl]
+ adrp x9, :pg_hi21:.Lmasks // load constants
+ add x9, x9, :lo12:.Lmasks
+ ld1 {$k48_k32.2d, $k16_k0.2d}, [x9]
+ rev64 $INlo.16b, $INlo.16b // byteswap Xi
+ ext $INlo.16b, $INlo.16b, $INlo.16b, #8
+ eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing
+
+ mov $len, #16
+ b .Lgmult_neon
+.size gcm_gmult_neon,.-gcm_gmult_neon
+
+.global gcm_ghash_neon
+.type gcm_ghash_neon,%function
+.align 4
+gcm_ghash_neon:
+ ld1 {$Xl.16b}, [$Xi] // load Xi
+ ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H
+ ld1 {$Hhi.1d}, [$Htbl]
+ adrp x9, :pg_hi21:.Lmasks // load constants
+ add x9, x9, :lo12:.Lmasks
+ ld1 {$k48_k32.2d, $k16_k0.2d}, [x9]
+ rev64 $Xl.16b, $Xl.16b // byteswap Xi
+ ext $Xl.16b, $Xl.16b, $Xl.16b, #8
+ eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing
+
+.Loop_neon:
+ ld1 {$INlo.16b}, [$inp], #16 // load inp
+ rev64 $INlo.16b, $INlo.16b // byteswap inp
+ ext $INlo.16b, $INlo.16b, $INlo.16b, #8
+ eor $INlo.16b, $INlo.16b, $Xl.16b // inp ^= Xi
+
+.Lgmult_neon:
+ // Split the input into $INlo and $INhi. (The upper halves are unused,
+ // so it is okay to leave them alone.)
+ ins $INhi.d[0], $INlo.d[1]
+___
+clmul64x64($Xl, $Hlo, $INlo); # low product: Xl = H.lo·Xi.lo
+$code .= <<___;
+ eor $INlo.8b, $INlo.8b, $INhi.8b // Karatsuba pre-processing
+___
+clmul64x64($Xm, $Hhl, $INlo); # middle product: Xm = (H.lo+H.hi)·(Xi.lo+Xi.hi)
+clmul64x64($Xh, $Hhi, $INhi); # high product: Xh = H.hi·Xi.hi
+$code .= <<___; # Karatsuba post-processing, reduction, loop tail, and the .Lmasks constants
+ ext $t0.16b, $Xl.16b, $Xh.16b, #8
+ eor $Xm.16b, $Xm.16b, $Xl.16b // Karatsuba post-processing
+ eor $Xm.16b, $Xm.16b, $Xh.16b
+ eor $Xm.16b, $Xm.16b, $t0.16b // Xm overlaps Xh.lo and Xl.hi
+ ins $Xl.d[1], $Xm.d[0] // Xh|Xl - 256-bit result
+ // This is a no-op due to the ins instruction below.
+ // ins $Xh.d[0], $Xm.d[1]
+
+ // equivalent of reduction_avx from ghash-x86_64.pl
+ shl $t1.2d, $Xl.2d, #57 // 1st phase
+ shl $t2.2d, $Xl.2d, #62
+ eor $t2.16b, $t2.16b, $t1.16b //
+ shl $t1.2d, $Xl.2d, #63
+ eor $t2.16b, $t2.16b, $t1.16b //
+ // Note Xm contains {Xl.d[1], Xh.d[0]}.
+ eor $t2.16b, $t2.16b, $Xm.16b
+ ins $Xl.d[1], $t2.d[0] // Xl.d[1] ^= t2.d[0]
+ ins $Xh.d[0], $t2.d[1] // Xh.d[0] ^= t2.d[1]
+
+ ushr $t2.2d, $Xl.2d, #1 // 2nd phase
+ eor $Xh.16b, $Xh.16b,$Xl.16b
+ eor $Xl.16b, $Xl.16b,$t2.16b //
+ ushr $t2.2d, $t2.2d, #6
+ ushr $Xl.2d, $Xl.2d, #1 //
+ eor $Xl.16b, $Xl.16b, $Xh.16b //
+ eor $Xl.16b, $Xl.16b, $t2.16b //
+
+ subs $len, $len, #16
+ bne .Loop_neon
+
+ rev64 $Xl.16b, $Xl.16b // byteswap Xi and write
+ ext $Xl.16b, $Xl.16b, $Xl.16b, #8
+ st1 {$Xl.16b}, [$Xi]
+
+ ret
+.size gcm_ghash_neon,.-gcm_ghash_neon
+
+.section .rodata
+.align 4
+.Lmasks:
+.quad 0x0000ffffffffffff // k48
+.quad 0x00000000ffffffff // k32
+.quad 0x000000000000ffff // k16
+.quad 0x0000000000000000 // k0
+.asciz "GHASH for ARMv8, derived from ARMv4 version by <appro\@openssl.org>"
+.align 2
+___
+
+foreach (split("\n",$code)) { # emit the accumulated assembly one line at a time
+ s/\`([^\`]*)\`/eval $1/geo; # replace each backtick-quoted span with the result of evaluating it as Perl
+
+ print $_,"\n";
+}
+close STDOUT or die "error closing STDOUT: $!"; # enforce flush; fail if buffered writes or the piped translator failed
diff --git a/crypto/fipsmodule/modes/internal.h b/crypto/fipsmodule/modes/internal.h
index 9a081eb..dec1e56 100644
--- a/crypto/fipsmodule/modes/internal.h
+++ b/crypto/fipsmodule/modes/internal.h
@@ -327,28 +327,12 @@
void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
size_t len);
-#if defined(OPENSSL_ARM)
-// 32-bit ARM also has support for doing GCM with NEON instructions.
OPENSSL_INLINE int gcm_neon_capable(void) { return CRYPTO_is_NEON_capable(); }
void gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]);
void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
size_t len);
-#else
-// AArch64 only has the ARMv8 versions of functions.
-OPENSSL_INLINE int gcm_neon_capable(void) { return 0; }
-OPENSSL_INLINE void gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]) {
- abort();
-}
-OPENSSL_INLINE void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]) {
- abort();
-}
-OPENSSL_INLINE void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16],
- const uint8_t *inp, size_t len) {
- abort();
-}
-#endif // OPENSSL_ARM
#elif defined(OPENSSL_PPC64LE)
#define GHASH_ASM_PPC64LE