Remove OPENSSL_ia32cap_P references from ChaCha20-Poly1305 assembly

CPU dispatch is now all moved out of assembly.

Fixed: 42290548
Change-Id: Icbe5053255d4aa76406b5303ba515ec38d42cb0d
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/70809
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
diff --git a/crypto/cipher_extra/aead_test.cc b/crypto/cipher_extra/aead_test.cc
index e16f5a2..9207ccc 100644
--- a/crypto/cipher_extra/aead_test.cc
+++ b/crypto/cipher_extra/aead_test.cc
@@ -825,15 +825,33 @@
   for (size_t len = 0; len <= 1024; len += 5) {
     SCOPED_TRACE(len);
     union chacha20_poly1305_open_data open_ctx = {};
+#if defined(OPENSSL_X86_64)
+    CHECK_ABI(chacha20_poly1305_open_nohw, buf.get(), buf.get(), len, buf.get(),
+              len % 128, &open_ctx);
+    if (CRYPTO_is_AVX2_capable() && CRYPTO_is_BMI2_capable()) {
+      CHECK_ABI(chacha20_poly1305_open_avx2, buf.get(), buf.get(), len,
+                buf.get(), len % 128, &open_ctx);
+    }
+#else
     CHECK_ABI(chacha20_poly1305_open, buf.get(), buf.get(), len, buf.get(),
               len % 128, &open_ctx);
+#endif
   }
 
   for (size_t len = 0; len <= 1024; len += 5) {
     SCOPED_TRACE(len);
     union chacha20_poly1305_seal_data seal_ctx = {};
+#if defined(OPENSSL_X86_64)
+    CHECK_ABI(chacha20_poly1305_seal_nohw, buf.get(), buf.get(), len, buf.get(),
+              len % 128, &seal_ctx);
+    if (CRYPTO_is_AVX2_capable() && CRYPTO_is_BMI2_capable()) {
+      CHECK_ABI(chacha20_poly1305_seal_avx2, buf.get(), buf.get(), len,
+                buf.get(), len % 128, &seal_ctx);
+    }
+#else
     CHECK_ABI(chacha20_poly1305_seal, buf.get(), buf.get(), len, buf.get(),
               len % 128, &seal_ctx);
+#endif
   }
 }
 #endif  // SUPPORTS_ABI_TEST
diff --git a/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl b/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl
index fb11760..9297052 100644
--- a/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl
+++ b/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl
@@ -37,13 +37,9 @@
 $avx = 2;
 
 $code.=<<___;
-.text
-.extern OPENSSL_ia32cap_P
-
-chacha20_poly1305_constants:
-
 .section .rodata
 .align 64
+chacha20_poly1305_constants:
 .Lchacha20_consts:
 .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
 .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
@@ -444,10 +440,10 @@
 #                             union chacha20_poly1305_open_data *aead_data)
 #
 $code.="
-.globl chacha20_poly1305_open
-.type chacha20_poly1305_open,\@function,6
+.globl chacha20_poly1305_open_nohw
+.type chacha20_poly1305_open_nohw,\@function,6
 .align 64
-chacha20_poly1305_open:
+chacha20_poly1305_open_nohw:
 .cfi_startproc
     _CET_ENDBR
     push %rbp
@@ -485,13 +481,8 @@
 $code.="
     mov %rdx, $inl
     mov $adl, 0+$len_store
-    mov $inl, 8+$len_store\n";
-$code.="
-    mov OPENSSL_ia32cap_P+8(%rip), %eax
-    and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
-    xor \$`(1<<5) + (1<<8)`, %eax
-    jz chacha20_poly1305_open_avx2\n" if ($avx>1);
-$code.="
+    mov $inl, 8+$len_store
+
     cmp \$128, $inl
     jbe .Lopen_sse_128
     # For long buffers, prepare the poly key first
@@ -858,7 +849,7 @@
         movdqa $C2, $B2
         movdqa $D2, $C2
     jmp .Lopen_sse_128_xor_hash
-.size chacha20_poly1305_open, .-chacha20_poly1305_open
+.size chacha20_poly1305_open_nohw, .-chacha20_poly1305_open_nohw
 .cfi_endproc
 
 ################################################################################
@@ -867,10 +858,10 @@
 #                             size_t plaintext_len, const uint8_t *ad,
 #                             size_t ad_len,
 #                             union chacha20_poly1305_seal_data *data);
-.globl  chacha20_poly1305_seal
-.type chacha20_poly1305_seal,\@function,6
+.globl  chacha20_poly1305_seal_nohw
+.type chacha20_poly1305_seal_nohw,\@function,6
 .align 64
-chacha20_poly1305_seal:
+chacha20_poly1305_seal_nohw:
 .cfi_startproc
     _CET_ENDBR
     push %rbp
@@ -909,13 +900,8 @@
     addq %rdx, $inl
     mov $adl, 0+$len_store
     mov $inl, 8+$len_store
-    mov %rdx, $inl\n";
-$code.="
-    mov OPENSSL_ia32cap_P+8(%rip), %eax
-    and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
-    xor \$`(1<<5) + (1<<8)`, %eax
-    jz chacha20_poly1305_seal_avx2\n" if ($avx>1);
-$code.="
+    mov %rdx, $inl
+
     cmp \$128, $inl
     jbe .Lseal_sse_128
     # For longer buffers, prepare the poly key + some stream
@@ -1371,7 +1357,7 @@
     mov %r8, $itr2
     call poly_hash_ad_internal
     jmp .Lseal_sse_128_tail_xor
-.size chacha20_poly1305_seal, .-chacha20_poly1305_seal
+.size chacha20_poly1305_seal_nohw, .-chacha20_poly1305_seal_nohw
 .cfi_endproc\n";
 }
 
@@ -1643,21 +1629,49 @@
 
 $code.="
 ###############################################################################
-.type chacha20_poly1305_open_avx2,\@abi-omnipotent
+.globl chacha20_poly1305_open_avx2
+.type chacha20_poly1305_open_avx2,\@function,6
 .align 64
 chacha20_poly1305_open_avx2:
 .cfi_startproc
-
-# Since the AVX2 function operates in the frame of the SSE function, we just copy the frame state to over here
+    _CET_ENDBR
+    push %rbp
 .cfi_push %rbp
+    push %rbx
 .cfi_push %rbx
+    push %r12
 .cfi_push %r12
+    push %r13
 .cfi_push %r13
+    push %r14
 .cfi_push %r14
+    push %r15
 .cfi_push %r15
+    # We write the calculated authenticator back to keyp at the end, so save
+    # the pointer on the stack too.
+    push $keyp
 .cfi_push $keyp
+    sub \$288 + $xmm_storage + 32, %rsp
 .cfi_adjust_cfa_offset 288 + 32
 
+    lea 32(%rsp), %rbp
+    and \$-32, %rbp\n";
+$code.="
+    movaps %xmm6,16*0+$xmm_store
+    movaps %xmm7,16*1+$xmm_store
+    movaps %xmm8,16*2+$xmm_store
+    movaps %xmm9,16*3+$xmm_store
+    movaps %xmm10,16*4+$xmm_store
+    movaps %xmm11,16*5+$xmm_store
+    movaps %xmm12,16*6+$xmm_store
+    movaps %xmm13,16*7+$xmm_store
+    movaps %xmm14,16*8+$xmm_store
+    movaps %xmm15,16*9+$xmm_store\n" if ($win64);
+$code.="
+    mov %rdx, $inl
+    mov $adl, 0+$len_store
+    mov $inl, 8+$len_store
+
     vzeroupper
     vmovdqa .Lchacha20_consts(%rip), $A0
     vbroadcasti128 0*16($keyp), $B0
@@ -2113,20 +2127,49 @@
 .cfi_endproc
 ###############################################################################
 ###############################################################################
-.type chacha20_poly1305_seal_avx2,\@abi-omnipotent
+.globl chacha20_poly1305_seal_avx2
+.type chacha20_poly1305_seal_avx2,\@function,6
 .align 64
 chacha20_poly1305_seal_avx2:
 .cfi_startproc
-
-# Since the AVX2 function operates in the frame of the SSE function, we just copy the frame state to over here
+    _CET_ENDBR
+    push %rbp
 .cfi_push %rbp
+    push %rbx
 .cfi_push %rbx
+    push %r12
 .cfi_push %r12
+    push %r13
 .cfi_push %r13
+    push %r14
 .cfi_push %r14
-.cfi_push %r15
+    push %r15
+.cfi_push %r15
+    # We write the calculated authenticator back to keyp at the end, so save
+    # the pointer on the stack too.
+    push $keyp
 .cfi_push $keyp
+    sub \$288 + $xmm_storage + 32, %rsp
 .cfi_adjust_cfa_offset 288 + 32
+    lea 32(%rsp), %rbp
+    and \$-32, %rbp\n";
+$code.="
+    movaps %xmm6,16*0+$xmm_store
+    movaps %xmm7,16*1+$xmm_store
+    movaps %xmm8,16*2+$xmm_store
+    movaps %xmm9,16*3+$xmm_store
+    movaps %xmm10,16*4+$xmm_store
+    movaps %xmm11,16*5+$xmm_store
+    movaps %xmm12,16*6+$xmm_store
+    movaps %xmm13,16*7+$xmm_store
+    movaps %xmm14,16*8+$xmm_store
+    movaps %xmm15,16*9+$xmm_store\n" if ($win64);
+$code.="
+    mov 56($keyp), $inl  # extra_in_len
+    addq %rdx, $inl
+    mov $adl, 0+$len_store
+    mov $inl, 8+$len_store
+    mov %rdx, $inl
 
     vzeroupper
     vmovdqa .Lchacha20_consts(%rip), $A0
diff --git a/crypto/cipher_extra/internal.h b/crypto/cipher_extra/internal.h
index 39ab950..32c0353 100644
--- a/crypto/cipher_extra/internal.h
+++ b/crypto/cipher_extra/internal.h
@@ -192,22 +192,65 @@
 // Additional input parameters are passed in |aead_data->in|. On exit, it will
 // write calculated tag value to |aead_data->out.tag|, which the caller must
 // check.
+#if defined(OPENSSL_X86_64)
+extern void chacha20_poly1305_open_nohw(
+    uint8_t *out_plaintext, const uint8_t *ciphertext, size_t plaintext_len,
+    const uint8_t *ad, size_t ad_len, union chacha20_poly1305_open_data *data);
+extern void chacha20_poly1305_open_avx2(
+    uint8_t *out_plaintext, const uint8_t *ciphertext, size_t plaintext_len,
+    const uint8_t *ad, size_t ad_len, union chacha20_poly1305_open_data *data);
+OPENSSL_INLINE void chacha20_poly1305_open(uint8_t *out_plaintext,
+                                   const uint8_t *ciphertext,
+                                   size_t plaintext_len, const uint8_t *ad,
+                                   size_t ad_len,
+                                   union chacha20_poly1305_open_data *data) {
+  if (CRYPTO_is_AVX2_capable() && CRYPTO_is_BMI2_capable()) {
+    chacha20_poly1305_open_avx2(out_plaintext, ciphertext, plaintext_len, ad,
+                                ad_len, data);
+  } else {
+    chacha20_poly1305_open_nohw(out_plaintext, ciphertext, plaintext_len, ad,
+                                ad_len, data);
+  }
+}
+#else
 extern void chacha20_poly1305_open(uint8_t *out_plaintext,
                                    const uint8_t *ciphertext,
                                    size_t plaintext_len, const uint8_t *ad,
                                    size_t ad_len,
                                    union chacha20_poly1305_open_data *data);
+#endif
 
 // chacha20_poly1305_open is defined in chacha20_poly1305_*.pl. It encrypts
 // |plaintext_len| bytes from |plaintext| and writes them to |out_ciphertext|.
 // Additional input parameters are passed in |aead_data->in|. The calculated tag
 // value is over the computed ciphertext concatenated with |extra_ciphertext|
 // and written to |aead_data->out.tag|.
+#if defined(OPENSSL_X86_64)
+extern void chacha20_poly1305_seal_nohw(
+    uint8_t *out_ciphertext, const uint8_t *plaintext, size_t plaintext_len,
+    const uint8_t *ad, size_t ad_len, union chacha20_poly1305_seal_data *data);
+extern void chacha20_poly1305_seal_avx2(
+    uint8_t *out_ciphertext, const uint8_t *plaintext, size_t plaintext_len,
+    const uint8_t *ad, size_t ad_len, union chacha20_poly1305_seal_data *data);
+OPENSSL_INLINE void chacha20_poly1305_seal(
+    uint8_t *out_ciphertext, const uint8_t *plaintext, size_t plaintext_len,
+    const uint8_t *ad, size_t ad_len, union chacha20_poly1305_seal_data *data) {
+  if (CRYPTO_is_AVX2_capable() && CRYPTO_is_BMI2_capable()) {
+    chacha20_poly1305_seal_avx2(out_ciphertext, plaintext, plaintext_len, ad,
+                                ad_len, data);
+  } else {
+    chacha20_poly1305_seal_nohw(out_ciphertext, plaintext, plaintext_len, ad,
+                                ad_len, data);
+  }
+}
+#else
 extern void chacha20_poly1305_seal(uint8_t *out_ciphertext,
                                    const uint8_t *plaintext,
                                    size_t plaintext_len, const uint8_t *ad,
                                    size_t ad_len,
                                    union chacha20_poly1305_seal_data *data);
+#endif
+
 #else
 
 OPENSSL_INLINE int chacha20_poly1305_asm_capable(void) { return 0; }
diff --git a/gen/crypto/chacha20_poly1305_x86_64-apple.S b/gen/crypto/chacha20_poly1305_x86_64-apple.S
index e4a7202..5ae5849 100644
--- a/gen/crypto/chacha20_poly1305_x86_64-apple.S
+++ b/gen/crypto/chacha20_poly1305_x86_64-apple.S
@@ -4,13 +4,9 @@
 #include <openssl/asm_base.h>
 
 #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text	
-
-
-chacha20_poly1305_constants:
-
 .section	__DATA,__const
 .p2align	6
+chacha20_poly1305_constants:
 L$chacha20_consts:
 .byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
 .byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
@@ -216,11 +212,11 @@
 
 
 
-.globl	_chacha20_poly1305_open
-.private_extern _chacha20_poly1305_open
+.globl	_chacha20_poly1305_open_nohw
+.private_extern _chacha20_poly1305_open_nohw
 
 .p2align	6
-_chacha20_poly1305_open:
+_chacha20_poly1305_open_nohw:
 
 _CET_ENDBR
 	pushq	%rbp
@@ -249,11 +245,6 @@
 	movq	%r8,0+0+32(%rbp)
 	movq	%rbx,8+0+32(%rbp)
 
-	movl	_OPENSSL_ia32cap_P+8(%rip),%eax
-	andl	$288,%eax
-	xorl	$288,%eax
-	jz	chacha20_poly1305_open_avx2
-
 	cmpq	$128,%rbx
 	jbe	L$open_sse_128
 
@@ -2088,11 +2079,11 @@
 
 
 
-.globl	_chacha20_poly1305_seal
-.private_extern _chacha20_poly1305_seal
+.globl	_chacha20_poly1305_seal_nohw
+.private_extern _chacha20_poly1305_seal_nohw
 
 .p2align	6
-_chacha20_poly1305_seal:
+_chacha20_poly1305_seal_nohw:
 
 _CET_ENDBR
 	pushq	%rbp
@@ -2122,11 +2113,6 @@
 	movq	%rbx,8+0+32(%rbp)
 	movq	%rdx,%rbx
 
-	movl	_OPENSSL_ia32cap_P+8(%rip),%eax
-	andl	$288,%eax
-	xorl	$288,%eax
-	jz	chacha20_poly1305_seal_avx2
-
 	cmpq	$128,%rbx
 	jbe	L$seal_sse_128
 
@@ -4075,20 +4061,38 @@
 
 
 
+.globl	_chacha20_poly1305_open_avx2
+.private_extern _chacha20_poly1305_open_avx2
 
 .p2align	6
-chacha20_poly1305_open_avx2:
+_chacha20_poly1305_open_avx2:
+
+_CET_ENDBR
+	pushq	%rbp
+
+	pushq	%rbx
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
 
 
 
+	pushq	%r9
+
+	subq	$288 + 0 + 32,%rsp
 
 
+	leaq	32(%rsp),%rbp
+	andq	$-32,%rbp
 
-
-
-
-
-
+	movq	%rdx,%rbx
+	movq	%r8,0+0+32(%rbp)
+	movq	%rbx,8+0+32(%rbp)
 
 	vzeroupper
 	vmovdqa	L$chacha20_consts(%rip),%ymm0
@@ -6223,20 +6227,39 @@
 
 
 
+.globl	_chacha20_poly1305_seal_avx2
+.private_extern _chacha20_poly1305_seal_avx2
 
 .p2align	6
-chacha20_poly1305_seal_avx2:
+_chacha20_poly1305_seal_avx2:
+
+_CET_ENDBR
+	pushq	%rbp
+
+	pushq	%rbx
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
 
 
 
+	pushq	%r9
 
+	subq	$288 + 0 + 32,%rsp
 
+	leaq	32(%rsp),%rbp
+	andq	$-32,%rbp
 
-
-
-
-
-
+	movq	56(%r9),%rbx
+	addq	%rdx,%rbx
+	movq	%r8,0+0+32(%rbp)
+	movq	%rbx,8+0+32(%rbp)
+	movq	%rdx,%rbx
 
 	vzeroupper
 	vmovdqa	L$chacha20_consts(%rip),%ymm0
diff --git a/gen/crypto/chacha20_poly1305_x86_64-linux.S b/gen/crypto/chacha20_poly1305_x86_64-linux.S
index ac38f8f..22123ee 100644
--- a/gen/crypto/chacha20_poly1305_x86_64-linux.S
+++ b/gen/crypto/chacha20_poly1305_x86_64-linux.S
@@ -4,14 +4,9 @@
 #include <openssl/asm_base.h>
 
 #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
-.text	
-.extern	OPENSSL_ia32cap_P
-.hidden OPENSSL_ia32cap_P
-
-chacha20_poly1305_constants:
-
 .section	.rodata
 .align	64
+chacha20_poly1305_constants:
 .Lchacha20_consts:
 .byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
 .byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
@@ -217,11 +212,11 @@
 .cfi_endproc	
 .size	poly_hash_ad_internal, .-poly_hash_ad_internal
 
-.globl	chacha20_poly1305_open
-.hidden chacha20_poly1305_open
-.type	chacha20_poly1305_open,@function
+.globl	chacha20_poly1305_open_nohw
+.hidden chacha20_poly1305_open_nohw
+.type	chacha20_poly1305_open_nohw,@function
 .align	64
-chacha20_poly1305_open:
+chacha20_poly1305_open_nohw:
 .cfi_startproc	
 _CET_ENDBR
 	pushq	%rbp
@@ -257,11 +252,6 @@
 	movq	%r8,0+0+32(%rbp)
 	movq	%rbx,8+0+32(%rbp)
 
-	movl	OPENSSL_ia32cap_P+8(%rip),%eax
-	andl	$288,%eax
-	xorl	$288,%eax
-	jz	chacha20_poly1305_open_avx2
-
 	cmpq	$128,%rbx
 	jbe	.Lopen_sse_128
 
@@ -2094,7 +2084,7 @@
 	movdqa	%xmm10,%xmm6
 	movdqa	%xmm14,%xmm10
 	jmp	.Lopen_sse_128_xor_hash
-.size	chacha20_poly1305_open, .-chacha20_poly1305_open
+.size	chacha20_poly1305_open_nohw, .-chacha20_poly1305_open_nohw
 .cfi_endproc	
 
 
@@ -2103,11 +2093,11 @@
 
 
 
-.globl	chacha20_poly1305_seal
-.hidden chacha20_poly1305_seal
-.type	chacha20_poly1305_seal,@function
+.globl	chacha20_poly1305_seal_nohw
+.hidden chacha20_poly1305_seal_nohw
+.type	chacha20_poly1305_seal_nohw,@function
 .align	64
-chacha20_poly1305_seal:
+chacha20_poly1305_seal_nohw:
 .cfi_startproc	
 _CET_ENDBR
 	pushq	%rbp
@@ -2144,11 +2134,6 @@
 	movq	%rbx,8+0+32(%rbp)
 	movq	%rdx,%rbx
 
-	movl	OPENSSL_ia32cap_P+8(%rip),%eax
-	andl	$288,%eax
-	xorl	$288,%eax
-	jz	chacha20_poly1305_seal_avx2
-
 	cmpq	$128,%rbx
 	jbe	.Lseal_sse_128
 
@@ -4100,32 +4085,50 @@
 	movq	%r8,%r8
 	call	poly_hash_ad_internal
 	jmp	.Lseal_sse_128_tail_xor
-.size	chacha20_poly1305_seal, .-chacha20_poly1305_seal
+.size	chacha20_poly1305_seal_nohw, .-chacha20_poly1305_seal_nohw
 .cfi_endproc	
 
 
+.globl	chacha20_poly1305_open_avx2
+.hidden chacha20_poly1305_open_avx2
 .type	chacha20_poly1305_open_avx2,@function
 .align	64
 chacha20_poly1305_open_avx2:
 .cfi_startproc	
-
-
+_CET_ENDBR
+	pushq	%rbp
 .cfi_adjust_cfa_offset	8
 .cfi_offset	%rbp,-16
+	pushq	%rbx
 .cfi_adjust_cfa_offset	8
 .cfi_offset	%rbx,-24
+	pushq	%r12
 .cfi_adjust_cfa_offset	8
 .cfi_offset	%r12,-32
+	pushq	%r13
 .cfi_adjust_cfa_offset	8
 .cfi_offset	%r13,-40
+	pushq	%r14
 .cfi_adjust_cfa_offset	8
 .cfi_offset	%r14,-48
+	pushq	%r15
 .cfi_adjust_cfa_offset	8
 .cfi_offset	%r15,-56
+
+
+	pushq	%r9
 .cfi_adjust_cfa_offset	8
 .cfi_offset	%r9,-64
+	subq	$288 + 0 + 32,%rsp
 .cfi_adjust_cfa_offset	288 + 32
 
+	leaq	32(%rsp),%rbp
+	andq	$-32,%rbp
+
+	movq	%rdx,%rbx
+	movq	%r8,0+0+32(%rbp)
+	movq	%rbx,8+0+32(%rbp)
+
 	vzeroupper
 	vmovdqa	.Lchacha20_consts(%rip),%ymm0
 	vbroadcasti128	0(%r9),%ymm4
@@ -6259,27 +6262,46 @@
 .cfi_endproc	
 
 
+.globl	chacha20_poly1305_seal_avx2
+.hidden chacha20_poly1305_seal_avx2
 .type	chacha20_poly1305_seal_avx2,@function
 .align	64
 chacha20_poly1305_seal_avx2:
 .cfi_startproc	
-
-
+_CET_ENDBR
+	pushq	%rbp
 .cfi_adjust_cfa_offset	8
 .cfi_offset	%rbp,-16
+	pushq	%rbx
 .cfi_adjust_cfa_offset	8
 .cfi_offset	%rbx,-24
+	pushq	%r12
 .cfi_adjust_cfa_offset	8
 .cfi_offset	%r12,-32
+	pushq	%r13
 .cfi_adjust_cfa_offset	8
 .cfi_offset	%r13,-40
+	pushq	%r14
 .cfi_adjust_cfa_offset	8
 .cfi_offset	%r14,-48
+	pushq	%r15
 .cfi_adjust_cfa_offset	8
 .cfi_offset	%r15,-56
+
+
+	pushq	%r9
 .cfi_adjust_cfa_offset	8
 .cfi_offset	%r9,-64
+	subq	$288 + 0 + 32,%rsp
 .cfi_adjust_cfa_offset	288 + 32
+	leaq	32(%rsp),%rbp
+	andq	$-32,%rbp
+
+	movq	56(%r9),%rbx
+	addq	%rdx,%rbx
+	movq	%r8,0+0+32(%rbp)
+	movq	%rbx,8+0+32(%rbp)
+	movq	%rdx,%rbx
 
 	vzeroupper
 	vmovdqa	.Lchacha20_consts(%rip),%ymm0
diff --git a/gen/crypto/chacha20_poly1305_x86_64-win.asm b/gen/crypto/chacha20_poly1305_x86_64-win.asm
index 095689c..22dbdfe 100644
--- a/gen/crypto/chacha20_poly1305_x86_64-win.asm
+++ b/gen/crypto/chacha20_poly1305_x86_64-win.asm
@@ -11,14 +11,9 @@
 %ifdef BORINGSSL_PREFIX
 %include "boringssl_prefix_symbols_nasm.inc"
 %endif
-section	.text code align=64
-
-EXTERN	OPENSSL_ia32cap_P
-
-chacha20_poly1305_constants:
-
 section	.rdata rdata align=8
 ALIGN	64
+chacha20_poly1305_constants:
 $L$chacha20_consts:
 	DB	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
 	DB	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
@@ -55,7 +50,7 @@
 	DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
 	DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
 	DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
-section	.text
+section	.text code align=64
 
 
 
@@ -225,14 +220,14 @@
 
 
 
-global	chacha20_poly1305_open
+global	chacha20_poly1305_open_nohw
 
 ALIGN	64
-chacha20_poly1305_open:
+chacha20_poly1305_open_nohw:
 	mov	QWORD[8+rsp],rdi	;WIN64 prologue
 	mov	QWORD[16+rsp],rsi
 	mov	rax,rsp
-$L$SEH_begin_chacha20_poly1305_open:
+$L$SEH_begin_chacha20_poly1305_open_nohw:
 	mov	rdi,rcx
 	mov	rsi,rdx
 	mov	rdx,r8
@@ -280,11 +275,6 @@
 	mov	QWORD[((0+160+32))+rbp],r8
 	mov	QWORD[((8+160+32))+rbp],rbx
 
-	mov	eax,DWORD[((OPENSSL_ia32cap_P+8))]
-	and	eax,288
-	xor	eax,288
-	jz	NEAR chacha20_poly1305_open_avx2
-
 	cmp	rbx,128
 	jbe	NEAR $L$open_sse_128
 
@@ -2123,7 +2113,7 @@
 	movdqa	xmm6,xmm10
 	movdqa	xmm10,xmm14
 	jmp	NEAR $L$open_sse_128_xor_hash
-$L$SEH_end_chacha20_poly1305_open:
+$L$SEH_end_chacha20_poly1305_open_nohw:
 
 
 
@@ -2132,14 +2122,14 @@
 
 
 
-global	chacha20_poly1305_seal
+global	chacha20_poly1305_seal_nohw
 
 ALIGN	64
-chacha20_poly1305_seal:
+chacha20_poly1305_seal_nohw:
 	mov	QWORD[8+rsp],rdi	;WIN64 prologue
 	mov	QWORD[16+rsp],rsi
 	mov	rax,rsp
-$L$SEH_begin_chacha20_poly1305_seal:
+$L$SEH_begin_chacha20_poly1305_seal_nohw:
 	mov	rdi,rcx
 	mov	rsi,rdx
 	mov	rdx,r8
@@ -2188,11 +2178,6 @@
 	mov	QWORD[((8+160+32))+rbp],rbx
 	mov	rbx,rdx
 
-	mov	eax,DWORD[((OPENSSL_ia32cap_P+8))]
-	and	eax,288
-	xor	eax,288
-	jz	NEAR chacha20_poly1305_seal_avx2
-
 	cmp	rbx,128
 	jbe	NEAR $L$seal_sse_128
 
@@ -4150,24 +4135,64 @@
 	mov	r8,r8
 	call	poly_hash_ad_internal
 	jmp	NEAR $L$seal_sse_128_tail_xor
-$L$SEH_end_chacha20_poly1305_seal:
+$L$SEH_end_chacha20_poly1305_seal_nohw:
 
 
 
+global	chacha20_poly1305_open_avx2
 
 ALIGN	64
 chacha20_poly1305_open_avx2:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_chacha20_poly1305_open_avx2:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
 
 
 
+_CET_ENDBR
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
 
 
 
+	push	r9
+
+	sub	rsp,288 + 160 + 32
 
 
+	lea	rbp,[32+rsp]
+	and	rbp,-32
 
+	movaps	XMMWORD[(0+0)+rbp],xmm6
+	movaps	XMMWORD[(16+0)+rbp],xmm7
+	movaps	XMMWORD[(32+0)+rbp],xmm8
+	movaps	XMMWORD[(48+0)+rbp],xmm9
+	movaps	XMMWORD[(64+0)+rbp],xmm10
+	movaps	XMMWORD[(80+0)+rbp],xmm11
+	movaps	XMMWORD[(96+0)+rbp],xmm12
+	movaps	XMMWORD[(112+0)+rbp],xmm13
+	movaps	XMMWORD[(128+0)+rbp],xmm14
+	movaps	XMMWORD[(144+0)+rbp],xmm15
 
-
+	mov	rbx,rdx
+	mov	QWORD[((0+160+32))+rbp],r8
+	mov	QWORD[((8+160+32))+rbp],rbx
 
 	vzeroupper
 	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
@@ -6298,24 +6323,65 @@
 	vperm2i128	ymm2,ymm6,ymm2,0x13
 	vperm2i128	ymm6,ymm14,ymm10,0x13
 	jmp	NEAR $L$open_avx2_short
+$L$SEH_end_chacha20_poly1305_open_avx2:
 
 
 
-
+global	chacha20_poly1305_seal_avx2
 
 ALIGN	64
 chacha20_poly1305_seal_avx2:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_chacha20_poly1305_seal_avx2:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
 
 
 
+_CET_ENDBR
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
 
 
 
+	push	r9
 
+	sub	rsp,288 + 160 + 32
 
+	lea	rbp,[32+rsp]
+	and	rbp,-32
 
+	movaps	XMMWORD[(0+0)+rbp],xmm6
+	movaps	XMMWORD[(16+0)+rbp],xmm7
+	movaps	XMMWORD[(32+0)+rbp],xmm8
+	movaps	XMMWORD[(48+0)+rbp],xmm9
+	movaps	XMMWORD[(64+0)+rbp],xmm10
+	movaps	XMMWORD[(80+0)+rbp],xmm11
+	movaps	XMMWORD[(96+0)+rbp],xmm12
+	movaps	XMMWORD[(112+0)+rbp],xmm13
+	movaps	XMMWORD[(128+0)+rbp],xmm14
+	movaps	XMMWORD[(144+0)+rbp],xmm15
 
-
+	mov	rbx,QWORD[56+r9]
+	add	rbx,rdx
+	mov	QWORD[((0+160+32))+rbp],r8
+	mov	QWORD[((8+160+32))+rbp],rbx
+	mov	rbx,rdx
 
 	vzeroupper
 	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
@@ -8950,7 +9016,7 @@
 	vzeroupper
 	jmp	NEAR $L$seal_sse_tail_16
 
-
+$L$SEH_end_chacha20_poly1305_seal_avx2:
 %else
 ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
 ret