aesv8-armx.pl: avoid 32-bit lane assignment in CTR mode
ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are affected
by silicon errata #1742098 [0] and #1655431 [1], respectively, where the
second instruction of a AES instruction pair may execute twice if an
interrupt is taken right after the first instruction consumes an input
register of which a single 32-bit lane has been updated the last time it
was modified.
Shuffle the counter assignments around a bit so that the most recent
updates when the AES instruction pair executes are 128-bit wide.
[0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
[1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
(This is imported from upstream's
409c59e8f44ae56f2587cdd8a7ce611d0e3d91d9.)
The change is applied to both 32-bit and 64-bit for simplicity, but there
was no measurable performance difference, so leaving them aligned is
easiest.
Change-Id: Ic8e5f656f59ae8c2ecb2762a066c2c9064bb34c5
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/44284
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/aes/asm/aesv8-armx.pl b/crypto/fipsmodule/aes/asm/aesv8-armx.pl
index a2825ce..1a2c20b 100644
--- a/crypto/fipsmodule/aes/asm/aesv8-armx.pl
+++ b/crypto/fipsmodule/aes/asm/aesv8-armx.pl
@@ -748,20 +748,34 @@
add $key_,$key,#32
mov $cnt,$rounds
cclr $step,lo
+
+ // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
+ // affected by silicon errata #1742098 [0] and #1655431 [1],
+ // respectively, where the second instruction of an aese/aesmc
+ // instruction pair may execute twice if an interrupt is taken right
+ // after the first instruction consumes an input register of which a
+ // single 32-bit lane has been updated the last time it was modified.
+ //
+ // This function uses a counter in one 32-bit lane. The vmov.32 lines
+ // could write to $dat1 and $dat2 directly, but that trips this bugs.
+ // We write to $ivec and copy to the final register as a workaround.
+ //
+ // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
+ // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
#ifndef __ARMEB__
rev $ctr, $ctr
#endif
- vorr $dat1,$dat0,$dat0
add $tctr1, $ctr, #1
- vorr $dat2,$dat0,$dat0
- add $ctr, $ctr, #2
vorr $ivec,$dat0,$dat0
rev $tctr1, $tctr1
- vmov.32 ${dat1}[3],$tctr1
+ vmov.32 ${ivec}[3],$tctr1
+ add $ctr, $ctr, #2
+ vorr $dat1,$ivec,$ivec
b.ls .Lctr32_tail
rev $tctr2, $ctr
+ vmov.32 ${ivec}[3],$tctr2
sub $len,$len,#3 // bias
- vmov.32 ${dat2}[3],$tctr2
+ vorr $dat2,$ivec,$ivec
b .Loop3x_ctr32
.align 4
@@ -788,11 +802,11 @@
aese $dat1,q8
aesmc $tmp1,$dat1
vld1.8 {$in0},[$inp],#16
- vorr $dat0,$ivec,$ivec
+ add $tctr0,$ctr,#1
aese $dat2,q8
aesmc $dat2,$dat2
vld1.8 {$in1},[$inp],#16
- vorr $dat1,$ivec,$ivec
+ rev $tctr0,$tctr0
aese $tmp0,q9
aesmc $tmp0,$tmp0
aese $tmp1,q9
@@ -801,8 +815,6 @@
mov $key_,$key
aese $dat2,q9
aesmc $tmp2,$dat2
- vorr $dat2,$ivec,$ivec
- add $tctr0,$ctr,#1
aese $tmp0,q12
aesmc $tmp0,$tmp0
aese $tmp1,q12
@@ -817,21 +829,26 @@
aesmc $tmp0,$tmp0
aese $tmp1,q13
aesmc $tmp1,$tmp1
+ // Note the logic to update $dat0, $dat1, and $dat1 is written to work
+ // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
+ // 32-bit mode. See the comment above.
veor $in2,$in2,$rndlast
- rev $tctr0,$tctr0
+ vmov.32 ${ivec}[3], $tctr0
aese $tmp2,q13
aesmc $tmp2,$tmp2
- vmov.32 ${dat0}[3], $tctr0
+ vorr $dat0,$ivec,$ivec
rev $tctr1,$tctr1
aese $tmp0,q14
aesmc $tmp0,$tmp0
+ vmov.32 ${ivec}[3], $tctr1
+ rev $tctr2,$ctr
aese $tmp1,q14
aesmc $tmp1,$tmp1
- vmov.32 ${dat1}[3], $tctr1
- rev $tctr2,$ctr
+ vorr $dat1,$ivec,$ivec
+ vmov.32 ${ivec}[3], $tctr2
aese $tmp2,q14
aesmc $tmp2,$tmp2
- vmov.32 ${dat2}[3], $tctr2
+ vorr $dat2,$ivec,$ivec
subs $len,$len,#3
aese $tmp0,q15
aese $tmp1,q15