Extract the AES-NI encrypt -> decrypt assembly conversion

aes_hw_set_decrypt_key calls aes_hw_set_encrypt_key and then does a
conversion, all in assembly. On x86(_64), aes_hw_set_encrypt_key
internally checks OPENSSL_ia32cap_P to call one of two variants.

In preparation for splitting those variants into separate functions, get
the in-asm function call out of the way by extracting an
aes_hw_encrypt_key_to_decrypt_key function.

Bug: 673
Change-Id: I23eefc00bdc8cb1f20e17fb6716974e91f1c32c4
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/68689
Reviewed-by: Bob Beck <bbe@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/aes/aes.c b/crypto/fipsmodule/aes/aes.c
index 60f3545..56dfbe2 100644
--- a/crypto/fipsmodule/aes/aes.c
+++ b/crypto/fipsmodule/aes/aes.c
@@ -104,3 +104,16 @@
     return aes_nohw_set_decrypt_key(key, bits, aeskey);
   }
 }
+
+#if defined(HWAES) && (defined(OPENSSL_X86) || defined(OPENSSL_X86_64))
+// On x86 and x86_64, |aes_hw_set_decrypt_key| is implemented in C: only
+// |aes_hw_set_encrypt_key| and |aes_hw_encrypt_key_to_decrypt_key| are in
+// assembly, and this function combines the two operations.
+int aes_hw_set_decrypt_key(const uint8_t *user_key, int bits, AES_KEY *key) {
+  int ret = aes_hw_set_encrypt_key(user_key, bits, key);
+  if (ret == 0) {
+    aes_hw_encrypt_key_to_decrypt_key(key);
+  }
+  return ret;
+}
+#endif
diff --git a/crypto/fipsmodule/aes/aes_test.cc b/crypto/fipsmodule/aes/aes_test.cc
index d4a458b..dc90067 100644
--- a/crypto/fipsmodule/aes/aes_test.cc
+++ b/crypto/fipsmodule/aes/aes_test.cc
@@ -346,7 +346,12 @@
 #endif
       }
 
+#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
+      ASSERT_EQ(CHECK_ABI_SEH(aes_hw_set_encrypt_key, kKey, bits, &key), 0);
+      CHECK_ABI_SEH(aes_hw_encrypt_key_to_decrypt_key, &key);
+#else
       ASSERT_EQ(CHECK_ABI_SEH(aes_hw_set_decrypt_key, kKey, bits, &key), 0);
+#endif
       CHECK_ABI(aes_hw_decrypt, block, block, &key);
       for (size_t blocks : block_counts) {
         SCOPED_TRACE(blocks);
diff --git a/crypto/fipsmodule/aes/asm/aesni-x86.pl b/crypto/fipsmodule/aes/asm/aesni-x86.pl
index bb5ee94..d8fdfb8 100644
--- a/crypto/fipsmodule/aes/asm/aesni-x86.pl
+++ b/crypto/fipsmodule/aes/asm/aesni-x86.pl
@@ -2490,17 +2490,11 @@
 	&ret	();
 &function_end_B("${PREFIX}_set_encrypt_key");
 
-# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
-#                              AES_KEY *key)
-&function_begin_B("${PREFIX}_set_decrypt_key");
-	&mov	("eax",&wparam(0));
-	&mov	($rounds,&wparam(1));
-	&mov	($key,&wparam(2));
-	&call	("_aesni_set_encrypt_key");
-	&mov	($key,&wparam(2));
-	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
-	&test	("eax","eax");
-	&jnz	(&label("dec_key_ret"));
+# void $PREFIX_encrypt_key_to_decrypt_key (AES_KEY *key)
+&function_begin_B("${PREFIX}_encrypt_key_to_decrypt_key");
+	&mov	($key,&wparam(0));
+	&mov	($rounds,&DWP(240,$key));
+	&shl	($rounds,4);
 	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule
 
 	&$movekey	("xmm0",&QWP(0,$key));	# just swap
@@ -2528,10 +2522,8 @@
 
 	&pxor		("xmm0","xmm0");
 	&pxor		("xmm1","xmm1");
-	&xor		("eax","eax");		# return success
-&set_label("dec_key_ret");
 	&ret	();
-&function_end_B("${PREFIX}_set_decrypt_key");
+&function_end_B("${PREFIX}_encrypt_key_to_decrypt_key");
 
 &set_label("key_const",64);
 &data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
diff --git a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
index 17ec466..8b6036e 100644
--- a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
@@ -3172,69 +3172,55 @@
 .size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
 ___
 }
-# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
-#				int bits, AES_KEY *key)
-#
-# input:	$inp	user-supplied key
-#		$bits	$inp length in bits
-#		$key	pointer to key schedule
-# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
-#		*$key	key schedule
-#
-{ my ($inp,$bits,$key) = @_4args;
-  $bits =~ s/%r/%e/;
+{ my ($key, $rounds, $tmp) = @_4args;
+  $rounds =~ s/%r/%e/;
 
+# void ${PREFIX}_encrypt_key_to_decrypt_key(AES_KEY *key)
 $code.=<<___;
-.globl	${PREFIX}_set_decrypt_key
-.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
+.globl	${PREFIX}_encrypt_key_to_decrypt_key
+.type	${PREFIX}_encrypt_key_to_decrypt_key,\@abi-omnipotent
 .align	16
-${PREFIX}_set_decrypt_key:
+${PREFIX}_encrypt_key_to_decrypt_key:
 .cfi_startproc
-.seh_startproc
 	_CET_ENDBR
-	sub	\$8,%rsp
-.cfi_adjust_cfa_offset	8
-.seh_stackalloc	8
-.seh_endprologue
-	call	__aesni_set_encrypt_key
-	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
-	test	%eax,%eax
-	jnz	.Ldec_key_ret
-	lea	16($key,$bits),$inp	# points at the end of key schedule
+
+	mov	240($key), $rounds
+	shl	\$4,$rounds
+
+	lea	16($key,$rounds),$tmp	# points at the end of key schedule
 
 	$movkey	($key),%xmm0		# just swap
-	$movkey	($inp),%xmm1
-	$movkey	%xmm0,($inp)
+	$movkey	($tmp),%xmm1
+	$movkey	%xmm0,($tmp)
 	$movkey	%xmm1,($key)
 	lea	16($key),$key
-	lea	-16($inp),$inp
+	lea	-16($tmp),$tmp
 
 .Ldec_key_inverse:
 	$movkey	($key),%xmm0		# swap and inverse
-	$movkey	($inp),%xmm1
+	$movkey	($tmp),%xmm1
 	aesimc	%xmm0,%xmm0
 	aesimc	%xmm1,%xmm1
 	lea	16($key),$key
-	lea	-16($inp),$inp
-	$movkey	%xmm0,16($inp)
+	lea	-16($tmp),$tmp
+	$movkey	%xmm0,16($tmp)
 	$movkey	%xmm1,-16($key)
-	cmp	$key,$inp
+	cmp	$key,$tmp
 	ja	.Ldec_key_inverse
 
 	$movkey	($key),%xmm0		# inverse middle
 	aesimc	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
-	$movkey	%xmm0,($inp)
+	$movkey	%xmm0,($tmp)
 	pxor	%xmm0,%xmm0
-.Ldec_key_ret:
-	add	\$8,%rsp
-.cfi_adjust_cfa_offset	-8
 	ret
 .cfi_endproc
-.seh_endproc
-.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
+.size	${PREFIX}_encrypt_key_to_decrypt_key,.-${PREFIX}_encrypt_key_to_decrypt_key
 ___
+}
 
+{ my ($inp,$bits,$key) = @_4args;
+  $bits =~ s/%r/%e/;
 # This is based on submission from Intel by
 #	Huang Ying
 #	Vinodh Gopal
@@ -3264,7 +3250,6 @@
 .type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
 .align	16
 ${PREFIX}_set_encrypt_key:
-__aesni_set_encrypt_key:
 .cfi_startproc
 .seh_startproc
 	_CET_ENDBR
@@ -3636,7 +3621,6 @@
 	xorps	%xmm1,%xmm2
 	ret
 .size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
-.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
 ___
 }
 
diff --git a/crypto/fipsmodule/aes/internal.h b/crypto/fipsmodule/aes/internal.h
index 98b2a14d..e7f55d2 100644
--- a/crypto/fipsmodule/aes/internal.h
+++ b/crypto/fipsmodule/aes/internal.h
@@ -66,17 +66,21 @@
 
 #if defined(HWAES)
 
-int aes_hw_set_encrypt_key(const uint8_t *user_key, const int bits,
-                           AES_KEY *key);
-int aes_hw_set_decrypt_key(const uint8_t *user_key, const int bits,
-                           AES_KEY *key);
+int aes_hw_set_encrypt_key(const uint8_t *user_key, int bits, AES_KEY *key);
+int aes_hw_set_decrypt_key(const uint8_t *user_key, int bits, AES_KEY *key);
 void aes_hw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
 void aes_hw_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
 void aes_hw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
-                        const AES_KEY *key, uint8_t *ivec, const int enc);
+                        const AES_KEY *key, uint8_t *ivec, int enc);
 void aes_hw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
                                  const AES_KEY *key, const uint8_t ivec[16]);
 
+#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
+// On x86 and x86_64, |aes_hw_set_decrypt_key| is implemented in terms of
+// |aes_hw_set_encrypt_key| and a conversion function.
+void aes_hw_encrypt_key_to_decrypt_key(AES_KEY *key);
+#endif
+
 #else
 
 // If HWAES isn't defined then we provide dummy functions for each of the hwaes
@@ -120,7 +124,7 @@
 
 #if defined(HWAES_ECB)
 void aes_hw_ecb_encrypt(const uint8_t *in, uint8_t *out, size_t length,
-                        const AES_KEY *key, const int enc);
+                        const AES_KEY *key, int enc);
 #endif  // HWAES_ECB
 
 
@@ -218,7 +222,7 @@
                                    size_t blocks, const AES_KEY *key,
                                    const uint8_t ivec[16]);
 void aes_nohw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len,
-                          const AES_KEY *key, uint8_t *ivec, const int enc);
+                          const AES_KEY *key, uint8_t *ivec, int enc);
 
 
 #if defined(__cplusplus)
diff --git a/gen/bcm/aesni-x86-apple.S b/gen/bcm/aesni-x86-apple.S
index cee5724..e64b4bb 100644
--- a/gen/bcm/aesni-x86-apple.S
+++ b/gen/bcm/aesni-x86-apple.S
@@ -2409,19 +2409,14 @@
 	movl	12(%esp),%edx
 	call	__aesni_set_encrypt_key
 	ret
-.globl	_aes_hw_set_decrypt_key
-.private_extern	_aes_hw_set_decrypt_key
+.globl	_aes_hw_encrypt_key_to_decrypt_key
+.private_extern	_aes_hw_encrypt_key_to_decrypt_key
 .align	4
-_aes_hw_set_decrypt_key:
-L_aes_hw_set_decrypt_key_begin:
-	movl	4(%esp),%eax
-	movl	8(%esp),%ecx
-	movl	12(%esp),%edx
-	call	__aesni_set_encrypt_key
-	movl	12(%esp),%edx
+_aes_hw_encrypt_key_to_decrypt_key:
+L_aes_hw_encrypt_key_to_decrypt_key_begin:
+	movl	4(%esp),%edx
+	movl	240(%edx),%ecx
 	shll	$4,%ecx
-	testl	%eax,%eax
-	jnz	L116dec_key_ret
 	leal	16(%edx,%ecx,1),%eax
 	movups	(%edx),%xmm0
 	movups	(%eax),%xmm1
@@ -2429,7 +2424,7 @@
 	movups	%xmm1,(%edx)
 	leal	16(%edx),%edx
 	leal	-16(%eax),%eax
-L117dec_key_inverse:
+L116dec_key_inverse:
 	movups	(%edx),%xmm0
 	movups	(%eax),%xmm1
 .byte	102,15,56,219,192
@@ -2439,14 +2434,12 @@
 	movups	%xmm0,16(%eax)
 	movups	%xmm1,-16(%edx)
 	cmpl	%edx,%eax
-	ja	L117dec_key_inverse
+	ja	L116dec_key_inverse
 	movups	(%edx),%xmm0
 .byte	102,15,56,219,192
 	movups	%xmm0,(%edx)
 	pxor	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
-	xorl	%eax,%eax
-L116dec_key_ret:
 	ret
 .align	6,0x90
 Lkey_const:
diff --git a/gen/bcm/aesni-x86-linux.S b/gen/bcm/aesni-x86-linux.S
index dbcded2..1f15c71 100644
--- a/gen/bcm/aesni-x86-linux.S
+++ b/gen/bcm/aesni-x86-linux.S
@@ -2447,20 +2447,15 @@
 	call	_aesni_set_encrypt_key
 	ret
 .size	aes_hw_set_encrypt_key,.-.L_aes_hw_set_encrypt_key_begin
-.globl	aes_hw_set_decrypt_key
-.hidden	aes_hw_set_decrypt_key
-.type	aes_hw_set_decrypt_key,@function
+.globl	aes_hw_encrypt_key_to_decrypt_key
+.hidden	aes_hw_encrypt_key_to_decrypt_key
+.type	aes_hw_encrypt_key_to_decrypt_key,@function
 .align	16
-aes_hw_set_decrypt_key:
-.L_aes_hw_set_decrypt_key_begin:
-	movl	4(%esp),%eax
-	movl	8(%esp),%ecx
-	movl	12(%esp),%edx
-	call	_aesni_set_encrypt_key
-	movl	12(%esp),%edx
+aes_hw_encrypt_key_to_decrypt_key:
+.L_aes_hw_encrypt_key_to_decrypt_key_begin:
+	movl	4(%esp),%edx
+	movl	240(%edx),%ecx
 	shll	$4,%ecx
-	testl	%eax,%eax
-	jnz	.L116dec_key_ret
 	leal	16(%edx,%ecx,1),%eax
 	movups	(%edx),%xmm0
 	movups	(%eax),%xmm1
@@ -2468,7 +2463,7 @@
 	movups	%xmm1,(%edx)
 	leal	16(%edx),%edx
 	leal	-16(%eax),%eax
-.L117dec_key_inverse:
+.L116dec_key_inverse:
 	movups	(%edx),%xmm0
 	movups	(%eax),%xmm1
 .byte	102,15,56,219,192
@@ -2478,16 +2473,14 @@
 	movups	%xmm0,16(%eax)
 	movups	%xmm1,-16(%edx)
 	cmpl	%edx,%eax
-	ja	.L117dec_key_inverse
+	ja	.L116dec_key_inverse
 	movups	(%edx),%xmm0
 .byte	102,15,56,219,192
 	movups	%xmm0,(%edx)
 	pxor	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
-	xorl	%eax,%eax
-.L116dec_key_ret:
 	ret
-.size	aes_hw_set_decrypt_key,.-.L_aes_hw_set_decrypt_key_begin
+.size	aes_hw_encrypt_key_to_decrypt_key,.-.L_aes_hw_encrypt_key_to_decrypt_key_begin
 .align	64
 .Lkey_const:
 .long	202313229,202313229,202313229,202313229
diff --git a/gen/bcm/aesni-x86-win.asm b/gen/bcm/aesni-x86-win.asm
index ce95d23..2d32e77 100644
--- a/gen/bcm/aesni-x86-win.asm
+++ b/gen/bcm/aesni-x86-win.asm
@@ -2400,18 +2400,13 @@
 	mov	edx,DWORD [12+esp]
 	call	__aesni_set_encrypt_key
 	ret
-global	_aes_hw_set_decrypt_key
+global	_aes_hw_encrypt_key_to_decrypt_key
 align	16
-_aes_hw_set_decrypt_key:
-L$_aes_hw_set_decrypt_key_begin:
-	mov	eax,DWORD [4+esp]
-	mov	ecx,DWORD [8+esp]
-	mov	edx,DWORD [12+esp]
-	call	__aesni_set_encrypt_key
-	mov	edx,DWORD [12+esp]
+_aes_hw_encrypt_key_to_decrypt_key:
+L$_aes_hw_encrypt_key_to_decrypt_key_begin:
+	mov	edx,DWORD [4+esp]
+	mov	ecx,DWORD [240+edx]
 	shl	ecx,4
-	test	eax,eax
-	jnz	NEAR L$116dec_key_ret
 	lea	eax,[16+ecx*1+edx]
 	movups	xmm0,[edx]
 	movups	xmm1,[eax]
@@ -2419,7 +2414,7 @@
 	movups	[edx],xmm1
 	lea	edx,[16+edx]
 	lea	eax,[eax-16]
-L$117dec_key_inverse:
+L$116dec_key_inverse:
 	movups	xmm0,[edx]
 	movups	xmm1,[eax]
 db	102,15,56,219,192
@@ -2429,14 +2424,12 @@
 	movups	[16+eax],xmm0
 	movups	[edx-16],xmm1
 	cmp	eax,edx
-	ja	NEAR L$117dec_key_inverse
+	ja	NEAR L$116dec_key_inverse
 	movups	xmm0,[edx]
 db	102,15,56,219,192
 	movups	[edx],xmm0
 	pxor	xmm0,xmm0
 	pxor	xmm1,xmm1
-	xor	eax,eax
-L$116dec_key_ret:
 	ret
 align	64
 L$key_const:
diff --git a/gen/bcm/aesni-x86_64-apple.S b/gen/bcm/aesni-x86_64-apple.S
index 48d3cfc..ccf9f8f 100644
--- a/gen/bcm/aesni-x86_64-apple.S
+++ b/gen/bcm/aesni-x86_64-apple.S
@@ -1905,61 +1905,51 @@
 	ret
 
 
-.globl	_aes_hw_set_decrypt_key
-.private_extern _aes_hw_set_decrypt_key
+.globl	_aes_hw_encrypt_key_to_decrypt_key
+.private_extern _aes_hw_encrypt_key_to_decrypt_key
 
 .p2align	4
-_aes_hw_set_decrypt_key:
-
+_aes_hw_encrypt_key_to_decrypt_key:
 
 _CET_ENDBR
-	subq	$8,%rsp
 
-
-
-	call	__aesni_set_encrypt_key
+	movl	240(%rdi),%esi
 	shll	$4,%esi
-	testl	%eax,%eax
-	jnz	L$dec_key_ret
-	leaq	16(%rdx,%rsi,1),%rdi
 
-	movups	(%rdx),%xmm0
-	movups	(%rdi),%xmm1
-	movups	%xmm0,(%rdi)
-	movups	%xmm1,(%rdx)
-	leaq	16(%rdx),%rdx
-	leaq	-16(%rdi),%rdi
+	leaq	16(%rdi,%rsi,1),%rdx
+
+	movups	(%rdi),%xmm0
+	movups	(%rdx),%xmm1
+	movups	%xmm0,(%rdx)
+	movups	%xmm1,(%rdi)
+	leaq	16(%rdi),%rdi
+	leaq	-16(%rdx),%rdx
 
 L$dec_key_inverse:
-	movups	(%rdx),%xmm0
-	movups	(%rdi),%xmm1
+	movups	(%rdi),%xmm0
+	movups	(%rdx),%xmm1
 .byte	102,15,56,219,192
 .byte	102,15,56,219,201
-	leaq	16(%rdx),%rdx
-	leaq	-16(%rdi),%rdi
-	movups	%xmm0,16(%rdi)
-	movups	%xmm1,-16(%rdx)
-	cmpq	%rdx,%rdi
+	leaq	16(%rdi),%rdi
+	leaq	-16(%rdx),%rdx
+	movups	%xmm0,16(%rdx)
+	movups	%xmm1,-16(%rdi)
+	cmpq	%rdi,%rdx
 	ja	L$dec_key_inverse
 
-	movups	(%rdx),%xmm0
+	movups	(%rdi),%xmm0
 .byte	102,15,56,219,192
 	pxor	%xmm1,%xmm1
-	movups	%xmm0,(%rdi)
+	movups	%xmm0,(%rdx)
 	pxor	%xmm0,%xmm0
-L$dec_key_ret:
-	addq	$8,%rsp
-
 	ret
 
 
-
 .globl	_aes_hw_set_encrypt_key
 .private_extern _aes_hw_set_encrypt_key
 
 .p2align	4
 _aes_hw_set_encrypt_key:
-__aesni_set_encrypt_key:
 
 
 _CET_ENDBR
@@ -2331,7 +2321,6 @@
 	xorps	%xmm1,%xmm2
 	ret
 
-
 .section	__DATA,__const
 .p2align	6
 L$bswap_mask:
diff --git a/gen/bcm/aesni-x86_64-linux.S b/gen/bcm/aesni-x86_64-linux.S
index fdbb28e..38ed6e7 100644
--- a/gen/bcm/aesni-x86_64-linux.S
+++ b/gen/bcm/aesni-x86_64-linux.S
@@ -1907,61 +1907,51 @@
 	ret
 .cfi_endproc	
 .size	aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt
-.globl	aes_hw_set_decrypt_key
-.hidden aes_hw_set_decrypt_key
-.type	aes_hw_set_decrypt_key,@function
+.globl	aes_hw_encrypt_key_to_decrypt_key
+.hidden aes_hw_encrypt_key_to_decrypt_key
+.type	aes_hw_encrypt_key_to_decrypt_key,@function
 .align	16
-aes_hw_set_decrypt_key:
+aes_hw_encrypt_key_to_decrypt_key:
 .cfi_startproc	
-
 _CET_ENDBR
-	subq	$8,%rsp
-.cfi_adjust_cfa_offset	8
 
-
-	call	__aesni_set_encrypt_key
+	movl	240(%rdi),%esi
 	shll	$4,%esi
-	testl	%eax,%eax
-	jnz	.Ldec_key_ret
-	leaq	16(%rdx,%rsi,1),%rdi
 
-	movups	(%rdx),%xmm0
-	movups	(%rdi),%xmm1
-	movups	%xmm0,(%rdi)
-	movups	%xmm1,(%rdx)
-	leaq	16(%rdx),%rdx
-	leaq	-16(%rdi),%rdi
+	leaq	16(%rdi,%rsi,1),%rdx
+
+	movups	(%rdi),%xmm0
+	movups	(%rdx),%xmm1
+	movups	%xmm0,(%rdx)
+	movups	%xmm1,(%rdi)
+	leaq	16(%rdi),%rdi
+	leaq	-16(%rdx),%rdx
 
 .Ldec_key_inverse:
-	movups	(%rdx),%xmm0
-	movups	(%rdi),%xmm1
+	movups	(%rdi),%xmm0
+	movups	(%rdx),%xmm1
 .byte	102,15,56,219,192
 .byte	102,15,56,219,201
-	leaq	16(%rdx),%rdx
-	leaq	-16(%rdi),%rdi
-	movups	%xmm0,16(%rdi)
-	movups	%xmm1,-16(%rdx)
-	cmpq	%rdx,%rdi
+	leaq	16(%rdi),%rdi
+	leaq	-16(%rdx),%rdx
+	movups	%xmm0,16(%rdx)
+	movups	%xmm1,-16(%rdi)
+	cmpq	%rdi,%rdx
 	ja	.Ldec_key_inverse
 
-	movups	(%rdx),%xmm0
+	movups	(%rdi),%xmm0
 .byte	102,15,56,219,192
 	pxor	%xmm1,%xmm1
-	movups	%xmm0,(%rdi)
+	movups	%xmm0,(%rdx)
 	pxor	%xmm0,%xmm0
-.Ldec_key_ret:
-	addq	$8,%rsp
-.cfi_adjust_cfa_offset	-8
 	ret
 .cfi_endproc	
-
-.size	aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
+.size	aes_hw_encrypt_key_to_decrypt_key,.-aes_hw_encrypt_key_to_decrypt_key
 .globl	aes_hw_set_encrypt_key
 .hidden aes_hw_set_encrypt_key
 .type	aes_hw_set_encrypt_key,@function
 .align	16
 aes_hw_set_encrypt_key:
-__aesni_set_encrypt_key:
 .cfi_startproc	
 
 _CET_ENDBR
@@ -2333,7 +2323,6 @@
 	xorps	%xmm1,%xmm2
 	ret
 .size	aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
-.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
 .section	.rodata
 .align	64
 .Lbswap_mask:
diff --git a/gen/bcm/aesni-x86_64-win.asm b/gen/bcm/aesni-x86_64-win.asm
index 6a14422..64dd6be 100644
--- a/gen/bcm/aesni-x86_64-win.asm
+++ b/gen/bcm/aesni-x86_64-win.asm
@@ -2012,59 +2012,49 @@
 	ret
 
 $L$SEH_end_aes_hw_cbc_encrypt:
-global	aes_hw_set_decrypt_key
+global	aes_hw_encrypt_key_to_decrypt_key
 
 ALIGN	16
-aes_hw_set_decrypt_key:
+aes_hw_encrypt_key_to_decrypt_key:
 
-$L$SEH_begin_aes_hw_set_decrypt_key_1:
 _CET_ENDBR
-	sub	rsp,8
 
-$L$SEH_prologue_aes_hw_set_decrypt_key_2:
-$L$SEH_endprologue_aes_hw_set_decrypt_key_3:
-	call	__aesni_set_encrypt_key
+	mov	edx,DWORD[240+rcx]
 	shl	edx,4
-	test	eax,eax
-	jnz	NEAR $L$dec_key_ret
-	lea	rcx,[16+rdx*1+r8]
 
-	movups	xmm0,XMMWORD[r8]
-	movups	xmm1,XMMWORD[rcx]
-	movups	XMMWORD[rcx],xmm0
-	movups	XMMWORD[r8],xmm1
-	lea	r8,[16+r8]
-	lea	rcx,[((-16))+rcx]
+	lea	r8,[16+rdx*1+rcx]
+
+	movups	xmm0,XMMWORD[rcx]
+	movups	xmm1,XMMWORD[r8]
+	movups	XMMWORD[r8],xmm0
+	movups	XMMWORD[rcx],xmm1
+	lea	rcx,[16+rcx]
+	lea	r8,[((-16))+r8]
 
 $L$dec_key_inverse:
-	movups	xmm0,XMMWORD[r8]
-	movups	xmm1,XMMWORD[rcx]
+	movups	xmm0,XMMWORD[rcx]
+	movups	xmm1,XMMWORD[r8]
 	DB	102,15,56,219,192
 	DB	102,15,56,219,201
-	lea	r8,[16+r8]
-	lea	rcx,[((-16))+rcx]
-	movups	XMMWORD[16+rcx],xmm0
-	movups	XMMWORD[(-16)+r8],xmm1
-	cmp	rcx,r8
+	lea	rcx,[16+rcx]
+	lea	r8,[((-16))+r8]
+	movups	XMMWORD[16+r8],xmm0
+	movups	XMMWORD[(-16)+rcx],xmm1
+	cmp	r8,rcx
 	ja	NEAR $L$dec_key_inverse
 
-	movups	xmm0,XMMWORD[r8]
+	movups	xmm0,XMMWORD[rcx]
 	DB	102,15,56,219,192
 	pxor	xmm1,xmm1
-	movups	XMMWORD[rcx],xmm0
+	movups	XMMWORD[r8],xmm0
 	pxor	xmm0,xmm0
-$L$dec_key_ret:
-	add	rsp,8
-
 	ret
 
-$L$SEH_end_aes_hw_set_decrypt_key_4:
 
 global	aes_hw_set_encrypt_key
 
 ALIGN	16
 aes_hw_set_encrypt_key:
-__aesni_set_encrypt_key:
 
 $L$SEH_begin_aes_hw_set_encrypt_key_1:
 _CET_ENDBR
@@ -2436,7 +2426,6 @@
 	xorps	xmm2,xmm1
 	ret
 
-
 section	.rdata rdata align=8
 ALIGN	64
 $L$bswap_mask:
@@ -2661,10 +2650,6 @@
 	DD	cbc_se_handler wrt ..imagebase
 section	.pdata
 ALIGN	4
-	DD	$L$SEH_begin_aes_hw_set_decrypt_key_1 wrt ..imagebase
-	DD	$L$SEH_end_aes_hw_set_decrypt_key_4 wrt ..imagebase
-	DD	$L$SEH_info_aes_hw_set_decrypt_key_0 wrt ..imagebase
-
 	DD	$L$SEH_begin_aes_hw_set_encrypt_key_1 wrt ..imagebase
 	DD	$L$SEH_end_aes_hw_set_encrypt_key_4 wrt ..imagebase
 	DD	$L$SEH_info_aes_hw_set_encrypt_key_0 wrt ..imagebase
@@ -2672,15 +2657,6 @@
 
 section	.xdata
 ALIGN	4
-$L$SEH_info_aes_hw_set_decrypt_key_0:
-	DB	1
-	DB	$L$SEH_endprologue_aes_hw_set_decrypt_key_3-$L$SEH_begin_aes_hw_set_decrypt_key_1
-	DB	1
-	DB	0
-	DB	$L$SEH_prologue_aes_hw_set_decrypt_key_2-$L$SEH_begin_aes_hw_set_decrypt_key_1
-	DB	2
-
-	DW	0
 $L$SEH_info_aes_hw_set_encrypt_key_0:
 	DB	1
 	DB	$L$SEH_endprologue_aes_hw_set_encrypt_key_3-$L$SEH_begin_aes_hw_set_encrypt_key_1