Fix missing vzeroupper in gcm_ghash_vpclmulqdq_avx2() for len=16
gcm_ghash_vpclmulqdq_avx2() executes vzeroupper only when len >= 32,
since it was supposed to use only xmm registers for len == 16. However,
it actually wrote the $BSWAP_MASK and $GFPOLY constants to two ymm
registers unconditionally. As a result, later code using legacy SSE
instructions could be slowed down, in the (probably rare) case where
gcm_ghash_vpclmulqdq_avx2() was called with len=16 *and* wasn't
followed by a function that does vzeroupper, e.g.
aes_gcm_enc_update_vaes_avx2().
(The Windows xmm register restore epilogue of
gcm_ghash_vpclmulqdq_avx2() itself does use legacy SSE instructions, so
it probably was slowed down by this, but that's just 8 instructions.)
Fix this by updating gcm_ghash_vpclmulqdq_avx2() to use only xmm
registers when len=16, as intended. This makes it more closely match
the similar code in aes-gcm-avx10-x86_64.pl, which already handles this
correctly.
Also, make both functions just execute vzeroupper unconditionally, so
that it won't be missed again. vzeroupper costs only 1 cycle on the
CPUs this code runs on, so executing it conditionally no longer seems
worthwhile.
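
Condensed, the new structure in aes-gcm-avx2-x86_64.pl is (sketch; see
the hunks below for the actual generated code):

    vmovdqu      .Lbswap_mask(%rip), %xmm6     # 128-bit loads only; ymm upper
    vmovdqu      .Lgfpoly(%rip), %xmm7         # halves stay clean for len=16
    cmp          $32, %rcx
    jb           .Lghash_lastblock
    vinserti128  $1, %xmm6, %ymm6, %ymm6       # len >= 32: broadcast the
    vinserti128  $1, %xmm7, %ymm7, %ymm7       # constants to both 128-bit lanes
    ...
    vzeroupper                                 # now executed unconditionally
    ret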
Change-Id: I975cd5b526e5cdae1a567f4085c2484552bf6bea
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/77227
Reviewed-by: David Benjamin <davidben@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl b/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl
index e8cf3be..2f35323 100644
--- a/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl
@@ -810,9 +810,6 @@
jae .Laad_loop_1x$local_label_suffix
.Laad_large_done$local_label_suffix:
- # Issue the vzeroupper that is needed after using ymm or zmm registers.
- # Do it here instead of at the end, to minimize overhead for small AADLEN.
- vzeroupper
# GHASH the remaining data 16 bytes at a time, using xmm registers only.
.Laad_blockbyblock$local_label_suffix:
@@ -833,6 +830,8 @@
# Store the updated GHASH accumulator back to memory.
vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
+
+ vzeroupper # This is needed after using ymm or zmm registers.
___
return $code;
}
diff --git a/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl b/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl
index ebbd0dd..deec309 100644
--- a/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl
@@ -490,15 +490,25 @@
@{[ _save_xmmregs (6 .. 9) ]}
.seh_endprologue
- vbroadcasti128 .Lbswap_mask(%rip), $BSWAP_MASK
+ # Load the bswap_mask and gfpoly constants. Since AADLEN is usually small,
+ # usually only 128-bit vectors will be used. So as an optimization, don't
+ # broadcast these constants to both 128-bit lanes quite yet.
+ vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK_XMM
+ vmovdqu .Lgfpoly(%rip), $GFPOLY_XMM
+
+ # Load the GHASH accumulator.
vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM
vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
- vbroadcasti128 .Lgfpoly(%rip), $GFPOLY
# Optimize for AADLEN < 32 by checking for AADLEN < 32 before AADLEN < 128.
cmp \$32, $AADLEN
jb .Lghash_lastblock
+ # AADLEN >= 32, so we'll operate on full vectors. Broadcast bswap_mask and
+ # gfpoly to both 128-bit lanes.
+ vinserti128 \$1, $BSWAP_MASK_XMM, $BSWAP_MASK, $BSWAP_MASK
+ vinserti128 \$1, $GFPOLY_XMM, $GFPOLY, $GFPOLY
+
cmp \$127, $AADLEN
jbe .Lghash_loop_1x
@@ -530,9 +540,6 @@
cmp \$32, $AADLEN
jae .Lghash_loop_1x
.Lghash_loop_1x_done:
- # Issue the vzeroupper that is needed after using ymm registers. Do it here
- # instead of at the end, to minimize overhead for small AADLEN.
- vzeroupper
# Update GHASH with the remaining 16-byte block if any.
.Lghash_lastblock:
@@ -549,6 +556,8 @@
# Store the updated GHASH accumulator back to memory.
vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
+
+ vzeroupper
___
}
$code .= _end_func;
diff --git a/gen/bcm/aes-gcm-avx10-x86_64-apple.S b/gen/bcm/aes-gcm-avx10-x86_64-apple.S
index a7ec87e..5409129 100644
--- a/gen/bcm/aes-gcm-avx10-x86_64-apple.S
+++ b/gen/bcm/aes-gcm-avx10-x86_64-apple.S
@@ -308,9 +308,6 @@
L$aad_large_done__func1:
- vzeroupper
-
-
L$aad_blockbyblock__func1:
testq %rcx,%rcx
jz L$aad_done__func1
@@ -339,6 +336,8 @@
vpshufb %xmm4,%xmm5,%xmm5
vmovdqu %xmm5,(%rdi)
+
+ vzeroupper
ret
diff --git a/gen/bcm/aes-gcm-avx10-x86_64-linux.S b/gen/bcm/aes-gcm-avx10-x86_64-linux.S
index 0ffc7c7..c319b72 100644
--- a/gen/bcm/aes-gcm-avx10-x86_64-linux.S
+++ b/gen/bcm/aes-gcm-avx10-x86_64-linux.S
@@ -308,9 +308,6 @@
.Laad_large_done__func1:
- vzeroupper
-
-
.Laad_blockbyblock__func1:
testq %rcx,%rcx
jz .Laad_done__func1
@@ -339,6 +336,8 @@
vpshufb %xmm4,%xmm5,%xmm5
vmovdqu %xmm5,(%rdi)
+
+ vzeroupper
ret
.cfi_endproc
diff --git a/gen/bcm/aes-gcm-avx10-x86_64-win.asm b/gen/bcm/aes-gcm-avx10-x86_64-win.asm
index 051a530..f2073e7 100644
--- a/gen/bcm/aes-gcm-avx10-x86_64-win.asm
+++ b/gen/bcm/aes-gcm-avx10-x86_64-win.asm
@@ -337,9 +337,6 @@
$L$aad_large_done__func1:
- vzeroupper
-
-
$L$aad_blockbyblock__func1:
test r9,r9
jz NEAR $L$aad_done__func1
@@ -368,6 +365,8 @@
vpshufb xmm5,xmm5,xmm4
vmovdqu XMMWORD[rcx],xmm5
+
+ vzeroupper
movdqa xmm6,XMMWORD[rsp]
movdqa xmm7,XMMWORD[16+rsp]
movdqa xmm8,XMMWORD[32+rsp]
diff --git a/gen/bcm/aes-gcm-avx2-x86_64-apple.S b/gen/bcm/aes-gcm-avx2-x86_64-apple.S
index d896f2a..431c816 100644
--- a/gen/bcm/aes-gcm-avx2-x86_64-apple.S
+++ b/gen/bcm/aes-gcm-avx2-x86_64-apple.S
@@ -202,15 +202,25 @@
- vbroadcasti128 L$bswap_mask(%rip),%ymm6
+
+
+
+ vmovdqu L$bswap_mask(%rip),%xmm6
+ vmovdqu L$gfpoly(%rip),%xmm7
+
+
vmovdqu (%rdi),%xmm5
vpshufb %xmm6,%xmm5,%xmm5
- vbroadcasti128 L$gfpoly(%rip),%ymm7
cmpq $32,%rcx
jb L$ghash_lastblock
+
+
+ vinserti128 $1,%xmm6,%ymm6,%ymm6
+ vinserti128 $1,%xmm7,%ymm7,%ymm7
+
cmpq $127,%rcx
jbe L$ghash_loop_1x
@@ -319,9 +329,6 @@
L$ghash_loop_1x_done:
- vzeroupper
-
-
L$ghash_lastblock:
testq %rcx,%rcx
jz L$ghash_done
@@ -348,6 +355,8 @@
vpshufb %xmm6,%xmm5,%xmm5
vmovdqu %xmm5,(%rdi)
+
+ vzeroupper
ret
diff --git a/gen/bcm/aes-gcm-avx2-x86_64-linux.S b/gen/bcm/aes-gcm-avx2-x86_64-linux.S
index 583f02f..a27a804 100644
--- a/gen/bcm/aes-gcm-avx2-x86_64-linux.S
+++ b/gen/bcm/aes-gcm-avx2-x86_64-linux.S
@@ -202,15 +202,25 @@
- vbroadcasti128 .Lbswap_mask(%rip),%ymm6
+
+
+
+ vmovdqu .Lbswap_mask(%rip),%xmm6
+ vmovdqu .Lgfpoly(%rip),%xmm7
+
+
vmovdqu (%rdi),%xmm5
vpshufb %xmm6,%xmm5,%xmm5
- vbroadcasti128 .Lgfpoly(%rip),%ymm7
cmpq $32,%rcx
jb .Lghash_lastblock
+
+
+ vinserti128 $1,%xmm6,%ymm6,%ymm6
+ vinserti128 $1,%xmm7,%ymm7,%ymm7
+
cmpq $127,%rcx
jbe .Lghash_loop_1x
@@ -319,9 +329,6 @@
.Lghash_loop_1x_done:
- vzeroupper
-
-
.Lghash_lastblock:
testq %rcx,%rcx
jz .Lghash_done
@@ -348,6 +355,8 @@
vpshufb %xmm6,%xmm5,%xmm5
vmovdqu %xmm5,(%rdi)
+
+ vzeroupper
ret
.cfi_endproc
diff --git a/gen/bcm/aes-gcm-avx2-x86_64-win.asm b/gen/bcm/aes-gcm-avx2-x86_64-win.asm
index 00e2a2b..aec14b3 100644
--- a/gen/bcm/aes-gcm-avx2-x86_64-win.asm
+++ b/gen/bcm/aes-gcm-avx2-x86_64-win.asm
@@ -229,15 +229,25 @@
$L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx2_7:
- vbroadcasti128 ymm6,XMMWORD[$L$bswap_mask]
+
+
+
+ vmovdqu xmm6,XMMWORD[$L$bswap_mask]
+ vmovdqu xmm7,XMMWORD[$L$gfpoly]
+
+
vmovdqu xmm5,XMMWORD[rcx]
vpshufb xmm5,xmm5,xmm6
- vbroadcasti128 ymm7,XMMWORD[$L$gfpoly]
cmp r9,32
jb NEAR $L$ghash_lastblock
+
+
+ vinserti128 ymm6,ymm6,xmm6,1
+ vinserti128 ymm7,ymm7,xmm7,1
+
cmp r9,127
jbe NEAR $L$ghash_loop_1x
@@ -346,9 +356,6 @@
$L$ghash_loop_1x_done:
- vzeroupper
-
-
$L$ghash_lastblock:
test r9,r9
jz NEAR $L$ghash_done
@@ -375,6 +382,8 @@
vpshufb xmm5,xmm5,xmm6
vmovdqu XMMWORD[rcx],xmm5
+
+ vzeroupper
movdqa xmm6,XMMWORD[rsp]
movdqa xmm7,XMMWORD[16+rsp]
movdqa xmm8,XMMWORD[32+rsp]