Fix missing vzeroupper in gcm_ghash_vpclmulqdq_avx2() for len=16

gcm_ghash_vpclmulqdq_avx2() executed vzeroupper only when len >= 32, on
the assumption that it used only xmm registers when len == 16.  But it
actually wrote to two ymm registers unconditionally, for $BSWAP_MASK
and $GFPOLY.  As a result, later code using legacy SSE instructions
could hit the AVX-SSE transition penalty, in the (probably rare) case
where gcm_ghash_vpclmulqdq_avx2() was called with len=16 *and* wasn't
followed by a function that executes vzeroupper, e.g.
aes_gcm_enc_update_vaes_avx2().

(The Windows xmm register restore epilogue of
gcm_ghash_vpclmulqdq_avx2() itself uses legacy SSE instructions, so it
was probably slowed down by this, but that's only 8 instructions.)
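
For illustration, the pre-fix control flow in aes-gcm-avx2-x86_64.pl
was roughly the following (abridged; comments added here):

    vbroadcasti128  .Lbswap_mask(%rip), $BSWAP_MASK   # writes a full ymm register
    ...
    vbroadcasti128  .Lgfpoly(%rip), $GFPOLY           # writes a full ymm register
    cmp             \$32, $AADLEN
    jb              .Lghash_lastblock        # len == 16: take the xmm-only path...
    ...
.Lghash_loop_1x_done:
    vzeroupper                               # ...which also skips this vzeroupper
.Lghash_lastblock:
    ...

So for len=16 the function could return with the ymm upper state still
dirty.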

Fix this by updating gcm_ghash_vpclmulqdq_avx2() to use only xmm
registers when len=16, as intended.  This brings it more closely in
line with the similar code in aes-gcm-avx10-x86_64.pl, which already
handles this case correctly.

Also, make both functions execute vzeroupper unconditionally, so that
it can't be missed again.  vzeroupper takes only about 1 cycle on the
CPUs this code runs on, so executing it conditionally no longer seems
worthwhile.
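
The resulting structure in the AVX2 function is roughly (abridged from
the diff below):

    vmovdqu         .Lbswap_mask(%rip), $BSWAP_MASK_XMM   # xmm-only loads
    vmovdqu         .Lgfpoly(%rip), $GFPOLY_XMM
    ...
    cmp             \$32, $AADLEN
    jb              .Lghash_lastblock
    # AADLEN >= 32: broadcast the constants to both 128-bit lanes.
    vinserti128     \$1, $BSWAP_MASK_XMM, $BSWAP_MASK, $BSWAP_MASK
    vinserti128     \$1, $GFPOLY_XMM, $GFPOLY, $GFPOLY
    ...
.Lghash_lastblock:
    ...
    vzeroupper      # now executed on every path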

Change-Id: I975cd5b526e5cdae1a567f4085c2484552bf6bea
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/77227
Reviewed-by: David Benjamin <davidben@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl b/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl
index e8cf3be..2f35323 100644
--- a/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl
@@ -810,9 +810,6 @@
     jae             .Laad_loop_1x$local_label_suffix
 
 .Laad_large_done$local_label_suffix:
-    # Issue the vzeroupper that is needed after using ymm or zmm registers.
-    # Do it here instead of at the end, to minimize overhead for small AADLEN.
-    vzeroupper
 
     # GHASH the remaining data 16 bytes at a time, using xmm registers only.
 .Laad_blockbyblock$local_label_suffix:
@@ -833,6 +830,8 @@
     # Store the updated GHASH accumulator back to memory.
     vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
     vmovdqu         $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
+
+    vzeroupper      # This is needed after using ymm or zmm registers.
 ___
     return $code;
 }
diff --git a/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl b/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl
index ebbd0dd..deec309 100644
--- a/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl
@@ -490,15 +490,25 @@
     @{[ _save_xmmregs (6 .. 9) ]}
     .seh_endprologue
 
-    vbroadcasti128  .Lbswap_mask(%rip), $BSWAP_MASK
+    # Load the bswap_mask and gfpoly constants.  Since AADLEN is usually small,
+    # usually only 128-bit vectors will be used.  So as an optimization, don't
+    # broadcast these constants to both 128-bit lanes quite yet.
+    vmovdqu         .Lbswap_mask(%rip), $BSWAP_MASK_XMM
+    vmovdqu         .Lgfpoly(%rip), $GFPOLY_XMM
+
+    # Load the GHASH accumulator.
     vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC_XMM
     vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
-    vbroadcasti128  .Lgfpoly(%rip), $GFPOLY
 
     # Optimize for AADLEN < 32 by checking for AADLEN < 32 before AADLEN < 128.
     cmp             \$32, $AADLEN
     jb              .Lghash_lastblock
 
+    # AADLEN >= 32, so we'll operate on full vectors.  Broadcast bswap_mask and
+    # gfpoly to both 128-bit lanes.
+    vinserti128     \$1, $BSWAP_MASK_XMM, $BSWAP_MASK, $BSWAP_MASK
+    vinserti128     \$1, $GFPOLY_XMM, $GFPOLY, $GFPOLY
+
     cmp             \$127, $AADLEN
     jbe             .Lghash_loop_1x
 
@@ -530,9 +540,6 @@
     cmp             \$32, $AADLEN
     jae             .Lghash_loop_1x
 .Lghash_loop_1x_done:
-    # Issue the vzeroupper that is needed after using ymm registers.  Do it here
-    # instead of at the end, to minimize overhead for small AADLEN.
-    vzeroupper
 
     # Update GHASH with the remaining 16-byte block if any.
 .Lghash_lastblock:
@@ -549,6 +556,8 @@
     # Store the updated GHASH accumulator back to memory.
     vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
     vmovdqu         $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
+
+    vzeroupper
 ___
 }
 $code .= _end_func;
diff --git a/gen/bcm/aes-gcm-avx10-x86_64-apple.S b/gen/bcm/aes-gcm-avx10-x86_64-apple.S
index a7ec87e..5409129 100644
--- a/gen/bcm/aes-gcm-avx10-x86_64-apple.S
+++ b/gen/bcm/aes-gcm-avx10-x86_64-apple.S
@@ -308,9 +308,6 @@
 L$aad_large_done__func1:
 
 
-	vzeroupper
-
-
 L$aad_blockbyblock__func1:
 	testq	%rcx,%rcx
 	jz	L$aad_done__func1
@@ -339,6 +336,8 @@
 
 	vpshufb	%xmm4,%xmm5,%xmm5
 	vmovdqu	%xmm5,(%rdi)
+
+	vzeroupper
 	ret
 
 
diff --git a/gen/bcm/aes-gcm-avx10-x86_64-linux.S b/gen/bcm/aes-gcm-avx10-x86_64-linux.S
index 0ffc7c7..c319b72 100644
--- a/gen/bcm/aes-gcm-avx10-x86_64-linux.S
+++ b/gen/bcm/aes-gcm-avx10-x86_64-linux.S
@@ -308,9 +308,6 @@
 .Laad_large_done__func1:
 
 
-	vzeroupper
-
-
 .Laad_blockbyblock__func1:
 	testq	%rcx,%rcx
 	jz	.Laad_done__func1
@@ -339,6 +336,8 @@
 
 	vpshufb	%xmm4,%xmm5,%xmm5
 	vmovdqu	%xmm5,(%rdi)
+
+	vzeroupper
 	ret
 
 .cfi_endproc	
diff --git a/gen/bcm/aes-gcm-avx10-x86_64-win.asm b/gen/bcm/aes-gcm-avx10-x86_64-win.asm
index 051a530..f2073e7 100644
--- a/gen/bcm/aes-gcm-avx10-x86_64-win.asm
+++ b/gen/bcm/aes-gcm-avx10-x86_64-win.asm
@@ -337,9 +337,6 @@
 $L$aad_large_done__func1:
 
 
-	vzeroupper
-
-
 $L$aad_blockbyblock__func1:
 	test	r9,r9
 	jz	NEAR $L$aad_done__func1
@@ -368,6 +365,8 @@
 
 	vpshufb	xmm5,xmm5,xmm4
 	vmovdqu	XMMWORD[rcx],xmm5
+
+	vzeroupper
 	movdqa	xmm6,XMMWORD[rsp]
 	movdqa	xmm7,XMMWORD[16+rsp]
 	movdqa	xmm8,XMMWORD[32+rsp]
diff --git a/gen/bcm/aes-gcm-avx2-x86_64-apple.S b/gen/bcm/aes-gcm-avx2-x86_64-apple.S
index d896f2a..431c816 100644
--- a/gen/bcm/aes-gcm-avx2-x86_64-apple.S
+++ b/gen/bcm/aes-gcm-avx2-x86_64-apple.S
@@ -202,15 +202,25 @@
 
 
 
-	vbroadcasti128	L$bswap_mask(%rip),%ymm6
+
+
+
+	vmovdqu	L$bswap_mask(%rip),%xmm6
+	vmovdqu	L$gfpoly(%rip),%xmm7
+
+
 	vmovdqu	(%rdi),%xmm5
 	vpshufb	%xmm6,%xmm5,%xmm5
-	vbroadcasti128	L$gfpoly(%rip),%ymm7
 
 
 	cmpq	$32,%rcx
 	jb	L$ghash_lastblock
 
+
+
+	vinserti128	$1,%xmm6,%ymm6,%ymm6
+	vinserti128	$1,%xmm7,%ymm7,%ymm7
+
 	cmpq	$127,%rcx
 	jbe	L$ghash_loop_1x
 
@@ -319,9 +329,6 @@
 L$ghash_loop_1x_done:
 
 
-	vzeroupper
-
-
 L$ghash_lastblock:
 	testq	%rcx,%rcx
 	jz	L$ghash_done
@@ -348,6 +355,8 @@
 
 	vpshufb	%xmm6,%xmm5,%xmm5
 	vmovdqu	%xmm5,(%rdi)
+
+	vzeroupper
 	ret
 
 
diff --git a/gen/bcm/aes-gcm-avx2-x86_64-linux.S b/gen/bcm/aes-gcm-avx2-x86_64-linux.S
index 583f02f..a27a804 100644
--- a/gen/bcm/aes-gcm-avx2-x86_64-linux.S
+++ b/gen/bcm/aes-gcm-avx2-x86_64-linux.S
@@ -202,15 +202,25 @@
 
 
 
-	vbroadcasti128	.Lbswap_mask(%rip),%ymm6
+
+
+
+	vmovdqu	.Lbswap_mask(%rip),%xmm6
+	vmovdqu	.Lgfpoly(%rip),%xmm7
+
+
 	vmovdqu	(%rdi),%xmm5
 	vpshufb	%xmm6,%xmm5,%xmm5
-	vbroadcasti128	.Lgfpoly(%rip),%ymm7
 
 
 	cmpq	$32,%rcx
 	jb	.Lghash_lastblock
 
+
+
+	vinserti128	$1,%xmm6,%ymm6,%ymm6
+	vinserti128	$1,%xmm7,%ymm7,%ymm7
+
 	cmpq	$127,%rcx
 	jbe	.Lghash_loop_1x
 
@@ -319,9 +329,6 @@
 .Lghash_loop_1x_done:
 
 
-	vzeroupper
-
-
 .Lghash_lastblock:
 	testq	%rcx,%rcx
 	jz	.Lghash_done
@@ -348,6 +355,8 @@
 
 	vpshufb	%xmm6,%xmm5,%xmm5
 	vmovdqu	%xmm5,(%rdi)
+
+	vzeroupper
 	ret
 
 .cfi_endproc	
diff --git a/gen/bcm/aes-gcm-avx2-x86_64-win.asm b/gen/bcm/aes-gcm-avx2-x86_64-win.asm
index 00e2a2b..aec14b3 100644
--- a/gen/bcm/aes-gcm-avx2-x86_64-win.asm
+++ b/gen/bcm/aes-gcm-avx2-x86_64-win.asm
@@ -229,15 +229,25 @@
 
 $L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx2_7:
 
-	vbroadcasti128	ymm6,XMMWORD[$L$bswap_mask]
+
+
+
+	vmovdqu	xmm6,XMMWORD[$L$bswap_mask]
+	vmovdqu	xmm7,XMMWORD[$L$gfpoly]
+
+
 	vmovdqu	xmm5,XMMWORD[rcx]
 	vpshufb	xmm5,xmm5,xmm6
-	vbroadcasti128	ymm7,XMMWORD[$L$gfpoly]
 
 
 	cmp	r9,32
 	jb	NEAR $L$ghash_lastblock
 
+
+
+	vinserti128	ymm6,ymm6,xmm6,1
+	vinserti128	ymm7,ymm7,xmm7,1
+
 	cmp	r9,127
 	jbe	NEAR $L$ghash_loop_1x
 
@@ -346,9 +356,6 @@
 $L$ghash_loop_1x_done:
 
 
-	vzeroupper
-
-
 $L$ghash_lastblock:
 	test	r9,r9
 	jz	NEAR $L$ghash_done
@@ -375,6 +382,8 @@
 
 	vpshufb	xmm5,xmm5,xmm6
 	vmovdqu	XMMWORD[rcx],xmm5
+
+	vzeroupper
 	movdqa	xmm6,XMMWORD[rsp]
 	movdqa	xmm7,XMMWORD[16+rsp]
 	movdqa	xmm8,XMMWORD[32+rsp]