Fix missing vzeroupper in gcm_ghash_vpclmulqdq_avx2() for len=16

gcm_ghash_vpclmulqdq_avx2() executed vzeroupper only when len >= 32, on
the assumption that it used only xmm registers when len == 16.  But it
actually wrote to two ymm registers unconditionally, for $BSWAP_MASK
and $GFPOLY.  As a result, later code using legacy SSE instructions
could hit the AVX-SSE transition penalty, in the (probably rare) case
where gcm_ghash_vpclmulqdq_avx2() was called with len=16 *and* wasn't
followed by a function that executes vzeroupper, e.g.
aes_gcm_enc_update_vaes_avx2().

(The Windows xmm register restore epilogue of
gcm_ghash_vpclmulqdq_avx2() itself uses legacy SSE instructions, so it
was probably slowed down by this, but that's only 8 instructions.)
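
For illustration, the pre-fix control flow in aes-gcm-avx2-x86_64.pl
was roughly the following (abridged; comments added here):

    vbroadcasti128  .Lbswap_mask(%rip), $BSWAP_MASK   # writes a full ymm register
    ...
    vbroadcasti128  .Lgfpoly(%rip), $GFPOLY           # writes a full ymm register
    cmp             \$32, $AADLEN
    jb              .Lghash_lastblock        # len == 16: take the xmm-only path...
    ...
.Lghash_loop_1x_done:
    vzeroupper                               # ...which also skips this vzeroupper
.Lghash_lastblock:
    ...

So for len=16 the function could return with the ymm upper state still
dirty.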

Fix this by updating gcm_ghash_vpclmulqdq_avx2() to use only xmm
registers when len=16, as intended.  This brings it more closely in
line with the similar code in aes-gcm-avx10-x86_64.pl, which already
handles this case correctly.

Also, make both functions execute vzeroupper unconditionally, so that
it can't be missed again.  vzeroupper takes only about 1 cycle on the
CPUs this code runs on, so executing it conditionally no longer seems
worthwhile.
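
The resulting structure in the AVX2 function is roughly (abridged from
the diff below):

    vmovdqu         .Lbswap_mask(%rip), $BSWAP_MASK_XMM   # xmm-only loads
    vmovdqu         .Lgfpoly(%rip), $GFPOLY_XMM
    ...
    cmp             \$32, $AADLEN
    jb              .Lghash_lastblock
    # AADLEN >= 32: broadcast the constants to both 128-bit lanes.
    vinserti128     \$1, $BSWAP_MASK_XMM, $BSWAP_MASK, $BSWAP_MASK
    vinserti128     \$1, $GFPOLY_XMM, $GFPOLY, $GFPOLY
    ...
.Lghash_lastblock:
    ...
    vzeroupper      # now executed on every path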

Change-Id: I975cd5b526e5cdae1a567f4085c2484552bf6bea
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/77227
Reviewed-by: David Benjamin <davidben@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl b/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl
index e8cf3be..2f35323 100644
--- a/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl
@@ -810,9 +810,6 @@
     jae             .Laad_loop_1x$local_label_suffix
 
 .Laad_large_done$local_label_suffix:
-    # Issue the vzeroupper that is needed after using ymm or zmm registers.
-    # Do it here instead of at the end, to minimize overhead for small AADLEN.
-    vzeroupper
 
     # GHASH the remaining data 16 bytes at a time, using xmm registers only.
 .Laad_blockbyblock$local_label_suffix:
@@ -833,6 +830,8 @@
     # Store the updated GHASH accumulator back to memory.
     vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
     vmovdqu         $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
+
+    vzeroupper      # This is needed after using ymm or zmm registers.
 ___
     return $code;
 }
diff --git a/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl b/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl
index ebbd0dd..deec309 100644
--- a/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl
@@ -490,15 +490,25 @@
     @{[ _save_xmmregs (6 .. 9) ]}
     .seh_endprologue
 
-    vbroadcasti128  .Lbswap_mask(%rip), $BSWAP_MASK
+    # Load the bswap_mask and gfpoly constants.  Since AADLEN is usually small,
+    # usually only 128-bit vectors will be used.  So as an optimization, don't
+    # broadcast these constants to both 128-bit lanes quite yet.
+    vmovdqu         .Lbswap_mask(%rip), $BSWAP_MASK_XMM
+    vmovdqu         .Lgfpoly(%rip), $GFPOLY_XMM
+
+    # Load the GHASH accumulator.
     vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC_XMM
     vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
-    vbroadcasti128  .Lgfpoly(%rip), $GFPOLY
 
     # Optimize for AADLEN < 32 by checking for AADLEN < 32 before AADLEN < 128.
     cmp             \$32, $AADLEN
     jb              .Lghash_lastblock
 
+    # AADLEN >= 32, so we'll operate on full vectors.  Broadcast bswap_mask and
+    # gfpoly to both 128-bit lanes.
+    vinserti128     \$1, $BSWAP_MASK_XMM, $BSWAP_MASK, $BSWAP_MASK
+    vinserti128     \$1, $GFPOLY_XMM, $GFPOLY, $GFPOLY
+
     cmp             \$127, $AADLEN
     jbe             .Lghash_loop_1x
 
@@ -530,9 +540,6 @@
     cmp             \$32, $AADLEN
     jae             .Lghash_loop_1x
 .Lghash_loop_1x_done:
-    # Issue the vzeroupper that is needed after using ymm registers.  Do it here
-    # instead of at the end, to minimize overhead for small AADLEN.
-    vzeroupper
 
     # Update GHASH with the remaining 16-byte block if any.
 .Lghash_lastblock:
@@ -549,6 +556,8 @@
     # Store the updated GHASH accumulator back to memory.
     vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
     vmovdqu         $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
+
+    vzeroupper
 ___
 }
 $code .= _end_func;
diff --git a/gen/bcm/aes-gcm-avx10-x86_64-apple.S b/gen/bcm/aes-gcm-avx10-x86_64-apple.S
index a7ec87e..5409129 100644
--- a/gen/bcm/aes-gcm-avx10-x86_64-apple.S
+++ b/gen/bcm/aes-gcm-avx10-x86_64-apple.S
@@ -308,9 +308,6 @@
 L$aad_large_done__func1:
 
 
-	vzeroupper
-
-
 L$aad_blockbyblock__func1:
 	testq	%rcx,%rcx
 	jz	L$aad_done__func1
@@ -339,6 +336,8 @@
 
 	vpshufb	%xmm4,%xmm5,%xmm5
 	vmovdqu	%xmm5,(%rdi)
+
+	vzeroupper
 	ret
 
 
diff --git a/gen/bcm/aes-gcm-avx10-x86_64-linux.S b/gen/bcm/aes-gcm-avx10-x86_64-linux.S
index 0ffc7c7..c319b72 100644
--- a/gen/bcm/aes-gcm-avx10-x86_64-linux.S
+++ b/gen/bcm/aes-gcm-avx10-x86_64-linux.S
@@ -308,9 +308,6 @@
 .Laad_large_done__func1:
 
 
-	vzeroupper
-
-
 .Laad_blockbyblock__func1:
 	testq	%rcx,%rcx
 	jz	.Laad_done__func1
@@ -339,6 +336,8 @@
 
 	vpshufb	%xmm4,%xmm5,%xmm5
 	vmovdqu	%xmm5,(%rdi)
+
+	vzeroupper
 	ret
 
 .cfi_endproc	
diff --git a/gen/bcm/aes-gcm-avx10-x86_64-win.asm b/gen/bcm/aes-gcm-avx10-x86_64-win.asm
index 051a530..f2073e7 100644
--- a/gen/bcm/aes-gcm-avx10-x86_64-win.asm
+++ b/gen/bcm/aes-gcm-avx10-x86_64-win.asm
@@ -337,9 +337,6 @@
 $L$aad_large_done__func1:
 
 
-	vzeroupper
-
-
 $L$aad_blockbyblock__func1:
 	test	r9,r9
 	jz	NEAR $L$aad_done__func1
@@ -368,6 +365,8 @@
 
 	vpshufb	xmm5,xmm5,xmm4
 	vmovdqu	XMMWORD[rcx],xmm5
+
+	vzeroupper
 	movdqa	xmm6,XMMWORD[rsp]
 	movdqa	xmm7,XMMWORD[16+rsp]
 	movdqa	xmm8,XMMWORD[32+rsp]
diff --git a/gen/bcm/aes-gcm-avx2-x86_64-apple.S b/gen/bcm/aes-gcm-avx2-x86_64-apple.S
index d896f2a..431c816 100644
--- a/gen/bcm/aes-gcm-avx2-x86_64-apple.S
+++ b/gen/bcm/aes-gcm-avx2-x86_64-apple.S
@@ -202,15 +202,25 @@
 
 
 
-	vbroadcasti128	L$bswap_mask(%rip),%ymm6
+
+
+
+	vmovdqu	L$bswap_mask(%rip),%xmm6
+	vmovdqu	L$gfpoly(%rip),%xmm7
+
+
 	vmovdqu	(%rdi),%xmm5
 	vpshufb	%xmm6,%xmm5,%xmm5
-	vbroadcasti128	L$gfpoly(%rip),%ymm7
 
 
 	cmpq	$32,%rcx
 	jb	L$ghash_lastblock
 
+
+
+	vinserti128	$1,%xmm6,%ymm6,%ymm6
+	vinserti128	$1,%xmm7,%ymm7,%ymm7
+
 	cmpq	$127,%rcx
 	jbe	L$ghash_loop_1x
 
@@ -319,9 +329,6 @@
 L$ghash_loop_1x_done:
 
 
-	vzeroupper
-
-
 L$ghash_lastblock:
 	testq	%rcx,%rcx
 	jz	L$ghash_done
@@ -348,6 +355,8 @@
 
 	vpshufb	%xmm6,%xmm5,%xmm5
 	vmovdqu	%xmm5,(%rdi)
+
+	vzeroupper
 	ret
 
 
diff --git a/gen/bcm/aes-gcm-avx2-x86_64-linux.S b/gen/bcm/aes-gcm-avx2-x86_64-linux.S
index 583f02f..a27a804 100644
--- a/gen/bcm/aes-gcm-avx2-x86_64-linux.S
+++ b/gen/bcm/aes-gcm-avx2-x86_64-linux.S
@@ -202,15 +202,25 @@
 
 
 
-	vbroadcasti128	.Lbswap_mask(%rip),%ymm6
+
+
+
+	vmovdqu	.Lbswap_mask(%rip),%xmm6
+	vmovdqu	.Lgfpoly(%rip),%xmm7
+
+
 	vmovdqu	(%rdi),%xmm5
 	vpshufb	%xmm6,%xmm5,%xmm5
-	vbroadcasti128	.Lgfpoly(%rip),%ymm7
 
 
 	cmpq	$32,%rcx
 	jb	.Lghash_lastblock
 
+
+
+	vinserti128	$1,%xmm6,%ymm6,%ymm6
+	vinserti128	$1,%xmm7,%ymm7,%ymm7
+
 	cmpq	$127,%rcx
 	jbe	.Lghash_loop_1x
 
@@ -319,9 +329,6 @@
 .Lghash_loop_1x_done:
 
 
-	vzeroupper
-
-
 .Lghash_lastblock:
 	testq	%rcx,%rcx
 	jz	.Lghash_done
@@ -348,6 +355,8 @@
 
 	vpshufb	%xmm6,%xmm5,%xmm5
 	vmovdqu	%xmm5,(%rdi)
+
+	vzeroupper
 	ret
 
 .cfi_endproc	
diff --git a/gen/bcm/aes-gcm-avx2-x86_64-win.asm b/gen/bcm/aes-gcm-avx2-x86_64-win.asm
index 00e2a2b..aec14b3 100644
--- a/gen/bcm/aes-gcm-avx2-x86_64-win.asm
+++ b/gen/bcm/aes-gcm-avx2-x86_64-win.asm
@@ -229,15 +229,25 @@
 
 $L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx2_7:
 
-	vbroadcasti128	ymm6,XMMWORD[$L$bswap_mask]
+
+
+
+	vmovdqu	xmm6,XMMWORD[$L$bswap_mask]
+	vmovdqu	xmm7,XMMWORD[$L$gfpoly]
+
+
 	vmovdqu	xmm5,XMMWORD[rcx]
 	vpshufb	xmm5,xmm5,xmm6
-	vbroadcasti128	ymm7,XMMWORD[$L$gfpoly]
 
 
 	cmp	r9,32
 	jb	NEAR $L$ghash_lastblock
 
+
+
+	vinserti128	ymm6,ymm6,xmm6,1
+	vinserti128	ymm7,ymm7,xmm7,1
+
 	cmp	r9,127
 	jbe	NEAR $L$ghash_loop_1x
 
@@ -346,9 +356,6 @@
 $L$ghash_loop_1x_done:
 
 
-	vzeroupper
-
-
 $L$ghash_lastblock:
 	test	r9,r9
 	jz	NEAR $L$ghash_done
@@ -375,6 +382,8 @@
 
 	vpshufb	xmm5,xmm5,xmm6
 	vmovdqu	XMMWORD[rcx],xmm5
+
+	vzeroupper
 	movdqa	xmm6,XMMWORD[rsp]
 	movdqa	xmm7,XMMWORD[16+rsp]
 	movdqa	xmm8,XMMWORD[32+rsp]