Remove OPENSSL_ia32cap_P references from AES-NI assembly

The AES-NI key schedule functions have two versions, dating to OpenSSL's
23f6eec71dbd472044db7dc854599f1de14a1f48. This cites RT#3576.
Unfortunately, OpenSSL purged their old RT bugs, without any archives,
so this context is now lost. Some archives of openssl-dev discussion
(also predating OpenSSL's archives) give most of the context:
https://groups.google.com/g/mailing.openssl.dev/c/OuFXwW4NfO8/m/7d2ZXVjkxVkJ

Broadly, although AES-NI has an aeskeygenassist instruction for the key
schedule, apparently it's overall faster to ignore it and use aesenclast
instead. But it's slower on older processors, so the assembly would
check for AVX && !XOP as a proxy. (Note we always set XOP to false, even
though this likely wasn't a capability check but a proxy for pre-Zen AMD
chips.)

It is unclear if the aeskeygenassist version is still worthwhile.
However, the aesenclast version requires SSSE3. SSSE3 long predates
AES-NI, but it's not clear if AES-NI implies SSSE3. In OpenSSL, the CCM
AES-NI assembly seems to assume it does. For now, I've preserved the
pair of them.

There are now only two assembly files with OPENSSL_ia32cap_P references!

Bug: 673
Change-Id: I990b1393d780db4caf074c184ce8bbd182da6e29
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/68690
Reviewed-by: Bob Beck <bbe@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/aes/aes.c b/crypto/fipsmodule/aes/aes.c
index 56dfbe2..7eab5ac 100644
--- a/crypto/fipsmodule/aes/aes.c
+++ b/crypto/fipsmodule/aes/aes.c
@@ -116,4 +116,12 @@
   }
   return ret;
 }
+
+int aes_hw_set_encrypt_key(const uint8_t *user_key, int bits, AES_KEY *key) {
+  if (aes_hw_set_encrypt_key_alt_preferred()) {
+    return aes_hw_set_encrypt_key_alt(user_key, bits, key);
+  } else {
+    return aes_hw_set_encrypt_key_base(user_key, bits, key);
+  }
+}
 #endif
diff --git a/crypto/fipsmodule/aes/aes_test.cc b/crypto/fipsmodule/aes/aes_test.cc
index dc90067..07feabb 100644
--- a/crypto/fipsmodule/aes/aes_test.cc
+++ b/crypto/fipsmodule/aes/aes_test.cc
@@ -347,7 +347,16 @@
       }
 
 #if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
-      ASSERT_EQ(CHECK_ABI_SEH(aes_hw_set_encrypt_key, kKey, bits, &key), 0);
+      ASSERT_EQ(CHECK_ABI_SEH(aes_hw_set_encrypt_key_base, kKey, bits, &key), 0);
+      if (aes_hw_set_encrypt_key_alt_capable()) {
+        AES_KEY alt;
+        ASSERT_EQ(CHECK_ABI_SEH(aes_hw_set_encrypt_key_alt, kKey, bits, &alt),
+                  0);
+        EXPECT_EQ(alt.rounds, key.rounds);
+        for (unsigned i = 0; i <= alt.rounds; i++) {
+          EXPECT_EQ(alt.rd_key[i], key.rd_key[i]);
+        }
+      }
       CHECK_ABI_SEH(aes_hw_encrypt_key_to_decrypt_key, &key);
 #else
       ASSERT_EQ(CHECK_ABI_SEH(aes_hw_set_decrypt_key, kKey, bits, &key), 0);
diff --git a/crypto/fipsmodule/aes/asm/aesni-x86.pl b/crypto/fipsmodule/aes/asm/aesni-x86.pl
index d8fdfb8..077be94 100644
--- a/crypto/fipsmodule/aes/asm/aesni-x86.pl
+++ b/crypto/fipsmodule/aes/asm/aesni-x86.pl
@@ -83,7 +83,6 @@
 
 &asm_init($ARGV[0]);
 
-&external_label("OPENSSL_ia32cap_P");
 &preprocessor_ifdef("BORINGSSL_DISPATCH_TEST")
 &external_label("BORINGSSL_function_hit");
 &preprocessor_endif();
@@ -2109,18 +2108,15 @@
 
 ######################################################################
 # Mechanical port from aesni-x86_64.pl.
-#
-# _aesni_set_encrypt_key is private interface,
-# input:
-#	"eax"	const unsigned char *userKey
-#	$rounds	int bits
-#	$key	AES_KEY *key
-# output:
-#	"eax"	return code
-#	$round	rounds
 
-&function_begin_B("_aesni_set_encrypt_key");
-	&push	("ebp");
+# int $PREFIX_set_encrypt_key_base (const unsigned char *userKey, int bits,
+#                                   AES_KEY *key)
+&function_begin_B("${PREFIX}_set_encrypt_key_base");
+	&record_function_hit(3);
+
+	&mov	("eax",&wparam(0));
+	&mov	($rounds,&wparam(1));
+	&mov	($key,&wparam(2));
 	&push	("ebx");
 
 	&call	(&label("pic"));
@@ -2128,12 +2124,9 @@
 	&blindpop("ebx");
 	&lea	("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
 
-	&picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
 	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
 	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
-	&mov	("ebp",&DWP(4,"ebp"));
 	&lea	($key,&DWP(16,$key));
-	&and	("ebp",1<<28|1<<11);	# AVX and XOP bits
 	&cmp	($rounds,256);
 	&je	(&label("14rounds"));
 	&cmp	($rounds,192);
@@ -2142,9 +2135,6 @@
 	&jne	(&label("bad_keybits"));
 
 &set_label("10rounds",16);
-	&cmp		("ebp",1<<28);
-	&je		(&label("10rounds_alt"));
-
 	&mov		($rounds,9);
 	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
 	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
@@ -2184,6 +2174,166 @@
 	&xorps		("xmm0","xmm1");
 	&ret();
 
+&set_label("12rounds",16);
+	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
+
+	&mov		($rounds,11);
+	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
+	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
+	&call		(&label("key_192a_cold"));
+	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
+	&call		(&label("key_192b"));
+	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
+	&call		(&label("key_192a"));
+	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
+	&call		(&label("key_192b"));
+	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
+	&call		(&label("key_192a"));
+	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
+	&call		(&label("key_192b"));
+	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
+	&call		(&label("key_192a"));
+	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
+	&call		(&label("key_192b"));
+	&$movekey	(&QWP(0,$key),"xmm0");
+	&mov		(&DWP(48,$key),$rounds);
+
+	&jmp	(&label("good_key"));
+
+&set_label("key_192a",16);
+	&$movekey	(&QWP(0,$key),"xmm0");
+	&lea		($key,&DWP(16,$key));
+&set_label("key_192a_cold",16);
+	&movaps		("xmm5","xmm2");
+&set_label("key_192b_warm");
+	&shufps		("xmm4","xmm0",0b00010000);
+	&movdqa		("xmm3","xmm2");
+	&xorps		("xmm0","xmm4");
+	&shufps		("xmm4","xmm0",0b10001100);
+	&pslldq		("xmm3",4);
+	&xorps		("xmm0","xmm4");
+	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
+	&pxor		("xmm2","xmm3");
+	&pxor		("xmm0","xmm1");
+	&pshufd		("xmm3","xmm0",0b11111111);
+	&pxor		("xmm2","xmm3");
+	&ret();
+
+&set_label("key_192b",16);
+	&movaps		("xmm3","xmm0");
+	&shufps		("xmm5","xmm0",0b01000100);
+	&$movekey	(&QWP(0,$key),"xmm5");
+	&shufps		("xmm3","xmm2",0b01001110);
+	&$movekey	(&QWP(16,$key),"xmm3");
+	&lea		($key,&DWP(32,$key));
+	&jmp		(&label("key_192b_warm"));
+
+&set_label("14rounds",16);
+	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
+	&lea		($key,&DWP(16,$key));
+
+	&mov		($rounds,13);
+	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
+	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
+	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
+	&call		(&label("key_256a_cold"));
+	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
+	&call		(&label("key_256b"));
+	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
+	&call		(&label("key_256a"));
+	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
+	&call		(&label("key_256b"));
+	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
+	&call		(&label("key_256a"));
+	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
+	&call		(&label("key_256b"));
+	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
+	&call		(&label("key_256a"));
+	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
+	&call		(&label("key_256b"));
+	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
+	&call		(&label("key_256a"));
+	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
+	&call		(&label("key_256b"));
+	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
+	&call		(&label("key_256a"));
+	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
+	&call		(&label("key_256b"));
+	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
+	&call		(&label("key_256a"));
+	&$movekey	(&QWP(0,$key),"xmm0");
+	&mov		(&DWP(16,$key),$rounds);
+	&xor		("eax","eax");
+
+	&jmp	(&label("good_key"));
+
+&set_label("key_256a",16);
+	&$movekey	(&QWP(0,$key),"xmm2");
+	&lea		($key,&DWP(16,$key));
+&set_label("key_256a_cold");
+	&shufps		("xmm4","xmm0",0b00010000);
+	&xorps		("xmm0","xmm4");
+	&shufps		("xmm4","xmm0",0b10001100);
+	&xorps		("xmm0","xmm4");
+	&shufps		("xmm1","xmm1",0b11111111);	# critical path
+	&xorps		("xmm0","xmm1");
+	&ret();
+
+&set_label("key_256b",16);
+	&$movekey	(&QWP(0,$key),"xmm0");
+	&lea		($key,&DWP(16,$key));
+
+	&shufps		("xmm4","xmm2",0b00010000);
+	&xorps		("xmm2","xmm4");
+	&shufps		("xmm4","xmm2",0b10001100);
+	&xorps		("xmm2","xmm4");
+	&shufps		("xmm1","xmm1",0b10101010);	# critical path
+	&xorps		("xmm2","xmm1");
+	&ret();
+
+&set_label("good_key");
+	&pxor	("xmm0","xmm0");
+	&pxor	("xmm1","xmm1");
+	&pxor	("xmm2","xmm2");
+	&pxor	("xmm3","xmm3");
+	&pxor	("xmm4","xmm4");
+	&pxor	("xmm5","xmm5");
+	&xor	("eax","eax");
+	&pop	("ebx");
+	&ret	();
+
+&set_label("bad_keybits",4);
+	&pxor	("xmm0","xmm0");
+	&mov	("eax",-2);
+	&pop	("ebx");
+	&ret	();
+&function_end_B("${PREFIX}_set_encrypt_key_base");
+
+# int $PREFIX_set_encrypt_key_alt (const unsigned char *userKey, int bits,
+#                                  AES_KEY *key)
+&function_begin_B("${PREFIX}_set_encrypt_key_alt");
+	&record_function_hit(3);
+
+	&mov	("eax",&wparam(0));
+	&mov	($rounds,&wparam(1));
+	&mov	($key,&wparam(2));
+	&push	("ebx");
+
+	&call	(&label("pic"));
+&set_label("pic");
+	&blindpop("ebx");
+	&lea	("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
+
+	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
+	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
+	&lea	($key,&DWP(16,$key));
+	&cmp	($rounds,256);
+	&je	(&label("14rounds_alt"));
+	&cmp	($rounds,192);
+	&je	(&label("12rounds_alt"));
+	&cmp	($rounds,128);
+	&jne	(&label("bad_keybits"));
+
 &set_label("10rounds_alt",16);
 	&movdqa		("xmm5",&QWP(0x00,"ebx"));
 	&mov		($rounds,8);
@@ -2249,63 +2399,8 @@
 
 	&jmp	(&label("good_key"));
 
-&set_label("12rounds",16);
-	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
-	&cmp		("ebp",1<<28);
-	&je		(&label("12rounds_alt"));
-
-	&mov		($rounds,11);
-	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
-	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
-	&call		(&label("key_192a_cold"));
-	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
-	&call		(&label("key_192b"));
-	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
-	&call		(&label("key_192a"));
-	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
-	&call		(&label("key_192b"));
-	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
-	&call		(&label("key_192a"));
-	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
-	&call		(&label("key_192b"));
-	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
-	&call		(&label("key_192a"));
-	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
-	&call		(&label("key_192b"));
-	&$movekey	(&QWP(0,$key),"xmm0");
-	&mov		(&DWP(48,$key),$rounds);
-
-	&jmp	(&label("good_key"));
-
-&set_label("key_192a",16);
-	&$movekey	(&QWP(0,$key),"xmm0");
-	&lea		($key,&DWP(16,$key));
-&set_label("key_192a_cold",16);
-	&movaps		("xmm5","xmm2");
-&set_label("key_192b_warm");
-	&shufps		("xmm4","xmm0",0b00010000);
-	&movdqa		("xmm3","xmm2");
-	&xorps		("xmm0","xmm4");
-	&shufps		("xmm4","xmm0",0b10001100);
-	&pslldq		("xmm3",4);
-	&xorps		("xmm0","xmm4");
-	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
-	&pxor		("xmm2","xmm3");
-	&pxor		("xmm0","xmm1");
-	&pshufd		("xmm3","xmm0",0b11111111);
-	&pxor		("xmm2","xmm3");
-	&ret();
-
-&set_label("key_192b",16);
-	&movaps		("xmm3","xmm0");
-	&shufps		("xmm5","xmm0",0b01000100);
-	&$movekey	(&QWP(0,$key),"xmm5");
-	&shufps		("xmm3","xmm2",0b01001110);
-	&$movekey	(&QWP(16,$key),"xmm3");
-	&lea		($key,&DWP(32,$key));
-	&jmp		(&label("key_192b_warm"));
-
 &set_label("12rounds_alt",16);
+	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
 	&movdqa		("xmm5",&QWP(0x10,"ebx"));
 	&movdqa		("xmm4",&QWP(0x20,"ebx"));
 	&mov		($rounds,8);
@@ -2344,72 +2439,9 @@
 
 	&jmp	(&label("good_key"));
 
-&set_label("14rounds",16);
+&set_label("14rounds_alt",16);
 	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
 	&lea		($key,&DWP(16,$key));
-	&cmp		("ebp",1<<28);
-	&je		(&label("14rounds_alt"));
-
-	&mov		($rounds,13);
-	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
-	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
-	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
-	&call		(&label("key_256a_cold"));
-	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
-	&call		(&label("key_256b"));
-	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
-	&call		(&label("key_256a"));
-	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
-	&call		(&label("key_256b"));
-	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
-	&call		(&label("key_256a"));
-	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
-	&call		(&label("key_256b"));
-	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
-	&call		(&label("key_256a"));
-	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
-	&call		(&label("key_256b"));
-	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
-	&call		(&label("key_256a"));
-	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
-	&call		(&label("key_256b"));
-	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
-	&call		(&label("key_256a"));
-	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
-	&call		(&label("key_256b"));
-	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
-	&call		(&label("key_256a"));
-	&$movekey	(&QWP(0,$key),"xmm0");
-	&mov		(&DWP(16,$key),$rounds);
-	&xor		("eax","eax");
-
-	&jmp	(&label("good_key"));
-
-&set_label("key_256a",16);
-	&$movekey	(&QWP(0,$key),"xmm2");
-	&lea		($key,&DWP(16,$key));
-&set_label("key_256a_cold");
-	&shufps		("xmm4","xmm0",0b00010000);
-	&xorps		("xmm0","xmm4");
-	&shufps		("xmm4","xmm0",0b10001100);
-	&xorps		("xmm0","xmm4");
-	&shufps		("xmm1","xmm1",0b11111111);	# critical path
-	&xorps		("xmm0","xmm1");
-	&ret();
-
-&set_label("key_256b",16);
-	&$movekey	(&QWP(0,$key),"xmm0");
-	&lea		($key,&DWP(16,$key));
-
-	&shufps		("xmm4","xmm2",0b00010000);
-	&xorps		("xmm2","xmm4");
-	&shufps		("xmm4","xmm2",0b10001100);
-	&xorps		("xmm2","xmm4");
-	&shufps		("xmm1","xmm1",0b10101010);	# critical path
-	&xorps		("xmm2","xmm1");
-	&ret();
-
-&set_label("14rounds_alt",16);
 	&movdqa		("xmm5",&QWP(0x00,"ebx"));
 	&movdqa		("xmm4",&QWP(0x20,"ebx"));
 	&mov		($rounds,7);
@@ -2467,28 +2499,14 @@
 	&pxor	("xmm5","xmm5");
 	&xor	("eax","eax");
 	&pop	("ebx");
-	&pop	("ebp");
 	&ret	();
 
 &set_label("bad_keybits",4);
 	&pxor	("xmm0","xmm0");
 	&mov	("eax",-2);
 	&pop	("ebx");
-	&pop	("ebp");
 	&ret	();
-&function_end_B("_aesni_set_encrypt_key");
-
-# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
-#                              AES_KEY *key)
-&function_begin_B("${PREFIX}_set_encrypt_key");
-	&record_function_hit(3);
-
-	&mov	("eax",&wparam(0));
-	&mov	($rounds,&wparam(1));
-	&mov	($key,&wparam(2));
-	&call	("_aesni_set_encrypt_key");
-	&ret	();
-&function_end_B("${PREFIX}_set_encrypt_key");
+&function_end_B("${PREFIX}_set_encrypt_key_alt");
 
 # void $PREFIX_encrypt_key_to_decrypt_key (AES_KEY *key)
 &function_begin_B("${PREFIX}_encrypt_key_to_decrypt_key");
diff --git a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
index 8b6036e..ab45749 100644
--- a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
@@ -211,7 +211,6 @@
 		("%rdi","%rsi","%rdx","%rcx");	# Unix order
 
 $code=".text\n";
-$code.=".extern	OPENSSL_ia32cap_P\n";
 
 $rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
 # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
@@ -3245,11 +3244,14 @@
 # are used. Note that it's declared "abi-omnipotent", which means that
 # amount of volatile registers is smaller on Windows.
 #
+# There are two variants of this function, one which uses aeskeygenassist
+# ("base") and one which uses aesenclast + pshufb ("alt"). See aes/internal.h
+# for details.
 $code.=<<___;
-.globl	${PREFIX}_set_encrypt_key
-.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
+.globl	${PREFIX}_set_encrypt_key_base
+.type	${PREFIX}_set_encrypt_key_base,\@abi-omnipotent
 .align	16
-${PREFIX}_set_encrypt_key:
+${PREFIX}_set_encrypt_key_base:
 .cfi_startproc
 .seh_startproc
 	_CET_ENDBR
@@ -3262,9 +3264,6 @@
 .seh_endprologue
 	movups	($inp),%xmm0		# pull first 128 bits of *userKey
 	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
-	leaq	OPENSSL_ia32cap_P(%rip),%r10
-	movl	4(%r10),%r10d
-	and	\$`1<<28|1<<11`,%r10d	# AVX and XOP bits
 	lea	16($key),%rax		# %rax is used as modifiable copy of $key
 	cmp	\$256,$bits
 	je	.L14rounds
@@ -3275,8 +3274,6 @@
 
 .L10rounds:
 	mov	\$9,$bits			# 10 rounds for 128-bit key
-	cmp	\$`1<<28`,%r10d			# AVX, bit no XOP
-	je	.L10rounds_alt
 
 	$movkey	%xmm0,($key)			# round 0
 	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
@@ -3305,78 +3302,9 @@
 	jmp	.Lenc_key_ret
 
 .align	16
-.L10rounds_alt:
-	movdqa	.Lkey_rotate(%rip),%xmm5
-	mov	\$8,%r10d
-	movdqa	.Lkey_rcon1(%rip),%xmm4
-	movdqa	%xmm0,%xmm2
-	movdqu	%xmm0,($key)
-	jmp	.Loop_key128
-
-.align	16
-.Loop_key128:
-	pshufb		%xmm5,%xmm0
-	aesenclast	%xmm4,%xmm0
-	pslld		\$1,%xmm4
-	lea		16(%rax),%rax
-
-	movdqa		%xmm2,%xmm3
-	pslldq		\$4,%xmm2
-	pxor		%xmm2,%xmm3
-	pslldq		\$4,%xmm2
-	pxor		%xmm2,%xmm3
-	pslldq		\$4,%xmm2
-	pxor		%xmm3,%xmm2
-
-	pxor		%xmm2,%xmm0
-	movdqu		%xmm0,-16(%rax)
-	movdqa		%xmm0,%xmm2
-
-	dec	%r10d
-	jnz	.Loop_key128
-
-	movdqa		.Lkey_rcon1b(%rip),%xmm4
-
-	pshufb		%xmm5,%xmm0
-	aesenclast	%xmm4,%xmm0
-	pslld		\$1,%xmm4
-
-	movdqa		%xmm2,%xmm3
-	pslldq		\$4,%xmm2
-	pxor		%xmm2,%xmm3
-	pslldq		\$4,%xmm2
-	pxor		%xmm2,%xmm3
-	pslldq		\$4,%xmm2
-	pxor		%xmm3,%xmm2
-
-	pxor		%xmm2,%xmm0
-	movdqu		%xmm0,(%rax)
-
-	movdqa		%xmm0,%xmm2
-	pshufb		%xmm5,%xmm0
-	aesenclast	%xmm4,%xmm0
-
-	movdqa		%xmm2,%xmm3
-	pslldq		\$4,%xmm2
-	pxor		%xmm2,%xmm3
-	pslldq		\$4,%xmm2
-	pxor		%xmm2,%xmm3
-	pslldq		\$4,%xmm2
-	pxor		%xmm3,%xmm2
-
-	pxor		%xmm2,%xmm0
-	movdqu		%xmm0,16(%rax)
-
-	mov	$bits,96(%rax)	# 240($key)
-	xor	%eax,%eax
-	jmp	.Lenc_key_ret
-
-.align	16
 .L12rounds:
 	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
 	mov	\$11,$bits			# 12 rounds for 192
-	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
-	je	.L12rounds_alt
 
 	$movkey	%xmm0,($key)			# round 0
 	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
@@ -3401,53 +3329,10 @@
 	jmp	.Lenc_key_ret
 
 .align	16
-.L12rounds_alt:
-	movdqa	.Lkey_rotate192(%rip),%xmm5
-	movdqa	.Lkey_rcon1(%rip),%xmm4
-	mov	\$8,%r10d
-	movdqu	%xmm0,($key)
-	jmp	.Loop_key192
-
-.align	16
-.Loop_key192:
-	movq		%xmm2,0(%rax)
-	movdqa		%xmm2,%xmm1
-	pshufb		%xmm5,%xmm2
-	aesenclast	%xmm4,%xmm2
-	pslld		\$1, %xmm4
-	lea		24(%rax),%rax
-
-	movdqa		%xmm0,%xmm3
-	pslldq		\$4,%xmm0
-	pxor		%xmm0,%xmm3
-	pslldq		\$4,%xmm0
-	pxor		%xmm0,%xmm3
-	pslldq		\$4,%xmm0
-	pxor		%xmm3,%xmm0
-
-	pshufd		\$0xff,%xmm0,%xmm3
-	pxor		%xmm1,%xmm3
-	pslldq		\$4,%xmm1
-	pxor		%xmm1,%xmm3
-
-	pxor		%xmm2,%xmm0
-	pxor		%xmm3,%xmm2
-	movdqu		%xmm0,-16(%rax)
-
-	dec	%r10d
-	jnz	.Loop_key192
-
-	mov	$bits,32(%rax)	# 240($key)
-	xor	%eax,%eax
-	jmp	.Lenc_key_ret
-
-.align	16
 .L14rounds:
 	movups	16($inp),%xmm2			# remaining half of *userKey
 	mov	\$13,$bits			# 14 rounds for 256
 	lea	16(%rax),%rax
-	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
-	je	.L14rounds_alt
 
 	$movkey	%xmm0,($key)			# round 0
 	$movkey	%xmm2,16($key)			# round 1
@@ -3483,60 +3368,6 @@
 	jmp	.Lenc_key_ret
 
 .align	16
-.L14rounds_alt:
-	movdqa	.Lkey_rotate(%rip),%xmm5
-	movdqa	.Lkey_rcon1(%rip),%xmm4
-	mov	\$7,%r10d
-	movdqu	%xmm0,0($key)
-	movdqa	%xmm2,%xmm1
-	movdqu	%xmm2,16($key)
-	jmp	.Loop_key256
-
-.align	16
-.Loop_key256:
-	pshufb		%xmm5,%xmm2
-	aesenclast	%xmm4,%xmm2
-
-	movdqa		%xmm0,%xmm3
-	pslldq		\$4,%xmm0
-	pxor		%xmm0,%xmm3
-	pslldq		\$4,%xmm0
-	pxor		%xmm0,%xmm3
-	pslldq		\$4,%xmm0
-	pxor		%xmm3,%xmm0
-	pslld		\$1,%xmm4
-
-	pxor		%xmm2,%xmm0
-	movdqu		%xmm0,(%rax)
-
-	dec	%r10d
-	jz	.Ldone_key256
-
-	pshufd		\$0xff,%xmm0,%xmm2
-	pxor		%xmm3,%xmm3
-	aesenclast	%xmm3,%xmm2
-
-	movdqa		%xmm1,%xmm3
-	pslldq		\$4,%xmm1
-	pxor		%xmm1,%xmm3
-	pslldq		\$4,%xmm1
-	pxor		%xmm1,%xmm3
-	pslldq		\$4,%xmm1
-	pxor		%xmm3,%xmm1
-
-	pxor		%xmm1,%xmm2
-	movdqu		%xmm2,16(%rax)
-	lea		32(%rax),%rax
-	movdqa		%xmm2,%xmm1
-
-	jmp	.Loop_key256
-
-.Ldone_key256:
-	mov	$bits,16(%rax)	# 240($key)
-	xor	%eax,%eax
-	jmp	.Lenc_key_ret
-
-.align	16
 .Lbad_keybits:
 	mov	\$-2,%rax
 .Lenc_key_ret:
@@ -3620,7 +3451,214 @@
 	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
 	xorps	%xmm1,%xmm2
 	ret
-.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+.size	${PREFIX}_set_encrypt_key_base,.-${PREFIX}_set_encrypt_key_base
+
+.globl	${PREFIX}_set_encrypt_key_alt
+.type	${PREFIX}_set_encrypt_key_alt,\@abi-omnipotent
+.align	16
+${PREFIX}_set_encrypt_key_alt:
+.cfi_startproc
+.seh_startproc
+	_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+	movb \$1,BORINGSSL_function_hit+3(%rip)
+#endif
+	sub	\$8,%rsp
+.cfi_adjust_cfa_offset	8
+.seh_stackalloc	8
+.seh_endprologue
+	movups	($inp),%xmm0		# pull first 128 bits of *userKey
+	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
+	lea	16($key),%rax		# %rax is used as modifiable copy of $key
+	cmp	\$256,$bits
+	je	.L14rounds_alt
+	cmp	\$192,$bits
+	je	.L12rounds_alt
+	cmp	\$128,$bits
+	jne	.Lbad_keybits_alt
+
+	mov	\$9,$bits			# 10 rounds for 128-bit key
+	movdqa	.Lkey_rotate(%rip),%xmm5
+	mov	\$8,%r10d
+	movdqa	.Lkey_rcon1(%rip),%xmm4
+	movdqa	%xmm0,%xmm2
+	movdqu	%xmm0,($key)
+	jmp	.Loop_key128
+
+.align	16
+.Loop_key128:
+	pshufb		%xmm5,%xmm0
+	aesenclast	%xmm4,%xmm0
+	pslld		\$1,%xmm4
+	lea		16(%rax),%rax
+
+	movdqa		%xmm2,%xmm3
+	pslldq		\$4,%xmm2
+	pxor		%xmm2,%xmm3
+	pslldq		\$4,%xmm2
+	pxor		%xmm2,%xmm3
+	pslldq		\$4,%xmm2
+	pxor		%xmm3,%xmm2
+
+	pxor		%xmm2,%xmm0
+	movdqu		%xmm0,-16(%rax)
+	movdqa		%xmm0,%xmm2
+
+	dec	%r10d
+	jnz	.Loop_key128
+
+	movdqa		.Lkey_rcon1b(%rip),%xmm4
+
+	pshufb		%xmm5,%xmm0
+	aesenclast	%xmm4,%xmm0
+	pslld		\$1,%xmm4
+
+	movdqa		%xmm2,%xmm3
+	pslldq		\$4,%xmm2
+	pxor		%xmm2,%xmm3
+	pslldq		\$4,%xmm2
+	pxor		%xmm2,%xmm3
+	pslldq		\$4,%xmm2
+	pxor		%xmm3,%xmm2
+
+	pxor		%xmm2,%xmm0
+	movdqu		%xmm0,(%rax)
+
+	movdqa		%xmm0,%xmm2
+	pshufb		%xmm5,%xmm0
+	aesenclast	%xmm4,%xmm0
+
+	movdqa		%xmm2,%xmm3
+	pslldq		\$4,%xmm2
+	pxor		%xmm2,%xmm3
+	pslldq		\$4,%xmm2
+	pxor		%xmm2,%xmm3
+	pslldq		\$4,%xmm2
+	pxor		%xmm3,%xmm2
+
+	pxor		%xmm2,%xmm0
+	movdqu		%xmm0,16(%rax)
+
+	mov	$bits,96(%rax)	# 240($key)
+	xor	%eax,%eax
+	jmp	.Lenc_key_ret_alt
+
+.align	16
+.L12rounds_alt:
+	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
+	mov	\$11,$bits			# 12 rounds for 192
+	movdqa	.Lkey_rotate192(%rip),%xmm5
+	movdqa	.Lkey_rcon1(%rip),%xmm4
+	mov	\$8,%r10d
+	movdqu	%xmm0,($key)
+	jmp	.Loop_key192
+
+.align	16
+.Loop_key192:
+	movq		%xmm2,0(%rax)
+	movdqa		%xmm2,%xmm1
+	pshufb		%xmm5,%xmm2
+	aesenclast	%xmm4,%xmm2
+	pslld		\$1, %xmm4
+	lea		24(%rax),%rax
+
+	movdqa		%xmm0,%xmm3
+	pslldq		\$4,%xmm0
+	pxor		%xmm0,%xmm3
+	pslldq		\$4,%xmm0
+	pxor		%xmm0,%xmm3
+	pslldq		\$4,%xmm0
+	pxor		%xmm3,%xmm0
+
+	pshufd		\$0xff,%xmm0,%xmm3
+	pxor		%xmm1,%xmm3
+	pslldq		\$4,%xmm1
+	pxor		%xmm1,%xmm3
+
+	pxor		%xmm2,%xmm0
+	pxor		%xmm3,%xmm2
+	movdqu		%xmm0,-16(%rax)
+
+	dec	%r10d
+	jnz	.Loop_key192
+
+	mov	$bits,32(%rax)	# 240($key)
+	xor	%eax,%eax
+	jmp	.Lenc_key_ret_alt
+
+.align	16
+.L14rounds_alt:
+	movups	16($inp),%xmm2			# remaining half of *userKey
+	mov	\$13,$bits			# 14 rounds for 256
+	lea	16(%rax),%rax
+	movdqa	.Lkey_rotate(%rip),%xmm5
+	movdqa	.Lkey_rcon1(%rip),%xmm4
+	mov	\$7,%r10d
+	movdqu	%xmm0,0($key)
+	movdqa	%xmm2,%xmm1
+	movdqu	%xmm2,16($key)
+	jmp	.Loop_key256
+
+.align	16
+.Loop_key256:
+	pshufb		%xmm5,%xmm2
+	aesenclast	%xmm4,%xmm2
+
+	movdqa		%xmm0,%xmm3
+	pslldq		\$4,%xmm0
+	pxor		%xmm0,%xmm3
+	pslldq		\$4,%xmm0
+	pxor		%xmm0,%xmm3
+	pslldq		\$4,%xmm0
+	pxor		%xmm3,%xmm0
+	pslld		\$1,%xmm4
+
+	pxor		%xmm2,%xmm0
+	movdqu		%xmm0,(%rax)
+
+	dec	%r10d
+	jz	.Ldone_key256
+
+	pshufd		\$0xff,%xmm0,%xmm2
+	pxor		%xmm3,%xmm3
+	aesenclast	%xmm3,%xmm2
+
+	movdqa		%xmm1,%xmm3
+	pslldq		\$4,%xmm1
+	pxor		%xmm1,%xmm3
+	pslldq		\$4,%xmm1
+	pxor		%xmm1,%xmm3
+	pslldq		\$4,%xmm1
+	pxor		%xmm3,%xmm1
+
+	pxor		%xmm1,%xmm2
+	movdqu		%xmm2,16(%rax)
+	lea		32(%rax),%rax
+	movdqa		%xmm2,%xmm1
+
+	jmp	.Loop_key256
+
+.Ldone_key256:
+	mov	$bits,16(%rax)	# 240($key)
+	xor	%eax,%eax
+	jmp	.Lenc_key_ret_alt
+
+.align	16
+.Lbad_keybits_alt:
+	mov	\$-2,%rax
+.Lenc_key_ret_alt:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	add	\$8,%rsp
+.cfi_adjust_cfa_offset	-8
+	ret
+.cfi_endproc
+.seh_endproc
+.size	${PREFIX}_set_encrypt_key_alt,.-${PREFIX}_set_encrypt_key_alt
 ___
 }
 
diff --git a/crypto/fipsmodule/aes/internal.h b/crypto/fipsmodule/aes/internal.h
index e7f55d2..7d2db3b 100644
--- a/crypto/fipsmodule/aes/internal.h
+++ b/crypto/fipsmodule/aes/internal.h
@@ -79,7 +79,27 @@
 // On x86 and x86_64, |aes_hw_set_decrypt_key| is implemented in terms of
 // |aes_hw_set_encrypt_key| and a conversion function.
 void aes_hw_encrypt_key_to_decrypt_key(AES_KEY *key);
-#endif
+
+// There are two variants of this function, one which uses aeskeygenassist
+// ("base") and one which uses aesenclast + pshufb ("alt"). aesenclast is
+// overall faster but is slower on some older processors. It doesn't use AVX,
+// but AVX is used as a proxy to detecting this. See
+// https://groups.google.com/g/mailing.openssl.dev/c/OuFXwW4NfO8/m/7d2ZXVjkxVkJ
+//
+// TODO(davidben): It is unclear if the aeskeygenassist version is still
+// worthwhile. However, the aesenclast version requires SSSE3. SSSE3 long
+// predates AES-NI, but it's not clear if AES-NI implies SSSE3. In OpenSSL, the
+// CCM AES-NI assembly seems to assume it does.
+OPENSSL_INLINE int aes_hw_set_encrypt_key_alt_capable(void) {
+  return hwaes_capable() && CRYPTO_is_SSSE3_capable();
+}
+OPENSSL_INLINE int aes_hw_set_encrypt_key_alt_preferred(void) {
+  return hwaes_capable() && CRYPTO_is_AVX_capable();
+}
+int aes_hw_set_encrypt_key_base(const uint8_t *user_key, int bits,
+                                AES_KEY *key);
+int aes_hw_set_encrypt_key_alt(const uint8_t *user_key, int bits, AES_KEY *key);
+#endif  // OPENSSL_X86 || OPENSSL_X86_64
 
 #else
 
diff --git a/crypto/perlasm/x86asm.pl b/crypto/perlasm/x86asm.pl
index f0d6310..b10eaf6 100644
--- a/crypto/perlasm/x86asm.pl
+++ b/crypto/perlasm/x86asm.pl
@@ -42,10 +42,10 @@
     &preprocessor_ifdef("BORINGSSL_DISPATCH_TEST");
     &push("ebx");
     &push("edx");
-    &call(&label("pic"));
-    &set_label("pic");
+    &call(&label("pic_for_function_hit"));
+    &set_label("pic_for_function_hit");
     &blindpop("ebx");
-    &lea("ebx",&DWP("BORINGSSL_function_hit+$index"."-".&label("pic"),"ebx"));
+    &lea("ebx",&DWP("BORINGSSL_function_hit+$index"."-".&label("pic_for_function_hit"),"ebx"));
     &mov("edx", 1);
     &movb(&BP(0, "ebx"), "dl");
     &pop("edx");
diff --git a/gen/bcm/aesni-x86-apple.S b/gen/bcm/aesni-x86-apple.S
index e64b4bb..dda66ed 100644
--- a/gen/bcm/aesni-x86-apple.S
+++ b/gen/bcm/aesni-x86-apple.S
@@ -15,10 +15,10 @@
 #ifdef BORINGSSL_DISPATCH_TEST
 	pushl	%ebx
 	pushl	%edx
-	call	L000pic
-L000pic:
+	call	L000pic_for_function_hit
+L000pic_for_function_hit:
 	popl	%ebx
-	leal	_BORINGSSL_function_hit+1-L000pic(%ebx),%ebx
+	leal	_BORINGSSL_function_hit+1-L000pic_for_function_hit(%ebx),%ebx
 	movl	$1,%edx
 	movb	%dl,(%ebx)
 	popl	%edx
@@ -820,10 +820,10 @@
 #ifdef BORINGSSL_DISPATCH_TEST
 	pushl	%ebx
 	pushl	%edx
-	call	L038pic
-L038pic:
+	call	L038pic_for_function_hit
+L038pic_for_function_hit:
 	popl	%ebx
-	leal	_BORINGSSL_function_hit+0-L038pic(%ebx),%ebx
+	leal	_BORINGSSL_function_hit+0-L038pic_for_function_hit(%ebx),%ebx
 	movl	$1,%edx
 	movb	%dl,(%ebx)
 	popl	%edx
@@ -2065,31 +2065,42 @@
 	popl	%ebx
 	popl	%ebp
 	ret
-.private_extern	__aesni_set_encrypt_key
+.globl	_aes_hw_set_encrypt_key_base
+.private_extern	_aes_hw_set_encrypt_key_base
 .align	4
-__aesni_set_encrypt_key:
-	pushl	%ebp
+_aes_hw_set_encrypt_key_base:
+L_aes_hw_set_encrypt_key_base_begin:
+#ifdef BORINGSSL_DISPATCH_TEST
 	pushl	%ebx
-	call	L093pic
-L093pic:
+	pushl	%edx
+	call	L093pic_for_function_hit
+L093pic_for_function_hit:
 	popl	%ebx
-	leal	Lkey_const-L093pic(%ebx),%ebx
-	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-Lkey_const(%ebx),%ebp
+	leal	_BORINGSSL_function_hit+3-L093pic_for_function_hit(%ebx),%ebx
+	movl	$1,%edx
+	movb	%dl,(%ebx)
+	popl	%edx
+	popl	%ebx
+#endif
+	movl	4(%esp),%eax
+	movl	8(%esp),%ecx
+	movl	12(%esp),%edx
+	pushl	%ebx
+	call	L094pic
+L094pic:
+	popl	%ebx
+	leal	Lkey_const-L094pic(%ebx),%ebx
 	movups	(%eax),%xmm0
 	xorps	%xmm4,%xmm4
-	movl	4(%ebp),%ebp
 	leal	16(%edx),%edx
-	andl	$268437504,%ebp
 	cmpl	$256,%ecx
-	je	L09414rounds
+	je	L09514rounds
 	cmpl	$192,%ecx
-	je	L09512rounds
+	je	L09612rounds
 	cmpl	$128,%ecx
-	jne	L096bad_keybits
+	jne	L097bad_keybits
 .align	4,0x90
-L09710rounds:
-	cmpl	$268435456,%ebp
-	je	L09810rounds_alt
+L09810rounds:
 	movl	$9,%ecx
 	movups	%xmm0,-16(%edx)
 .byte	102,15,58,223,200,1
@@ -2128,13 +2139,176 @@
 	xorps	%xmm1,%xmm0
 	ret
 .align	4,0x90
-L09810rounds_alt:
+L09612rounds:
+	movq	16(%eax),%xmm2
+	movl	$11,%ecx
+	movups	%xmm0,-16(%edx)
+.byte	102,15,58,223,202,1
+	call	L102key_192a_cold
+.byte	102,15,58,223,202,2
+	call	L103key_192b
+.byte	102,15,58,223,202,4
+	call	L104key_192a
+.byte	102,15,58,223,202,8
+	call	L103key_192b
+.byte	102,15,58,223,202,16
+	call	L104key_192a
+.byte	102,15,58,223,202,32
+	call	L103key_192b
+.byte	102,15,58,223,202,64
+	call	L104key_192a
+.byte	102,15,58,223,202,128
+	call	L103key_192b
+	movups	%xmm0,(%edx)
+	movl	%ecx,48(%edx)
+	jmp	L101good_key
+.align	4,0x90
+L104key_192a:
+	movups	%xmm0,(%edx)
+	leal	16(%edx),%edx
+.align	4,0x90
+L102key_192a_cold:
+	movaps	%xmm2,%xmm5
+L105key_192b_warm:
+	shufps	$16,%xmm0,%xmm4
+	movdqa	%xmm2,%xmm3
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	pslldq	$4,%xmm3
+	xorps	%xmm4,%xmm0
+	pshufd	$85,%xmm1,%xmm1
+	pxor	%xmm3,%xmm2
+	pxor	%xmm1,%xmm0
+	pshufd	$255,%xmm0,%xmm3
+	pxor	%xmm3,%xmm2
+	ret
+.align	4,0x90
+L103key_192b:
+	movaps	%xmm0,%xmm3
+	shufps	$68,%xmm0,%xmm5
+	movups	%xmm5,(%edx)
+	shufps	$78,%xmm2,%xmm3
+	movups	%xmm3,16(%edx)
+	leal	32(%edx),%edx
+	jmp	L105key_192b_warm
+.align	4,0x90
+L09514rounds:
+	movups	16(%eax),%xmm2
+	leal	16(%edx),%edx
+	movl	$13,%ecx
+	movups	%xmm0,-32(%edx)
+	movups	%xmm2,-16(%edx)
+.byte	102,15,58,223,202,1
+	call	L106key_256a_cold
+.byte	102,15,58,223,200,1
+	call	L107key_256b
+.byte	102,15,58,223,202,2
+	call	L108key_256a
+.byte	102,15,58,223,200,2
+	call	L107key_256b
+.byte	102,15,58,223,202,4
+	call	L108key_256a
+.byte	102,15,58,223,200,4
+	call	L107key_256b
+.byte	102,15,58,223,202,8
+	call	L108key_256a
+.byte	102,15,58,223,200,8
+	call	L107key_256b
+.byte	102,15,58,223,202,16
+	call	L108key_256a
+.byte	102,15,58,223,200,16
+	call	L107key_256b
+.byte	102,15,58,223,202,32
+	call	L108key_256a
+.byte	102,15,58,223,200,32
+	call	L107key_256b
+.byte	102,15,58,223,202,64
+	call	L108key_256a
+	movups	%xmm0,(%edx)
+	movl	%ecx,16(%edx)
+	xorl	%eax,%eax
+	jmp	L101good_key
+.align	4,0x90
+L108key_256a:
+	movups	%xmm2,(%edx)
+	leal	16(%edx),%edx
+L106key_256a_cold:
+	shufps	$16,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$255,%xmm1,%xmm1
+	xorps	%xmm1,%xmm0
+	ret
+.align	4,0x90
+L107key_256b:
+	movups	%xmm0,(%edx)
+	leal	16(%edx),%edx
+	shufps	$16,%xmm2,%xmm4
+	xorps	%xmm4,%xmm2
+	shufps	$140,%xmm2,%xmm4
+	xorps	%xmm4,%xmm2
+	shufps	$170,%xmm1,%xmm1
+	xorps	%xmm1,%xmm2
+	ret
+L101good_key:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	xorl	%eax,%eax
+	popl	%ebx
+	ret
+.align	2,0x90
+L097bad_keybits:
+	pxor	%xmm0,%xmm0
+	movl	$-2,%eax
+	popl	%ebx
+	ret
+.globl	_aes_hw_set_encrypt_key_alt
+.private_extern	_aes_hw_set_encrypt_key_alt
+.align	4
+_aes_hw_set_encrypt_key_alt:
+L_aes_hw_set_encrypt_key_alt_begin:
+#ifdef BORINGSSL_DISPATCH_TEST
+	pushl	%ebx
+	pushl	%edx
+	call	L109pic_for_function_hit
+L109pic_for_function_hit:
+	popl	%ebx
+	leal	_BORINGSSL_function_hit+3-L109pic_for_function_hit(%ebx),%ebx
+	movl	$1,%edx
+	movb	%dl,(%ebx)
+	popl	%edx
+	popl	%ebx
+#endif
+	movl	4(%esp),%eax
+	movl	8(%esp),%ecx
+	movl	12(%esp),%edx
+	pushl	%ebx
+	call	L110pic
+L110pic:
+	popl	%ebx
+	leal	Lkey_const-L110pic(%ebx),%ebx
+	movups	(%eax),%xmm0
+	xorps	%xmm4,%xmm4
+	leal	16(%edx),%edx
+	cmpl	$256,%ecx
+	je	L11114rounds_alt
+	cmpl	$192,%ecx
+	je	L11212rounds_alt
+	cmpl	$128,%ecx
+	jne	L113bad_keybits
+.align	4,0x90
+L11410rounds_alt:
 	movdqa	(%ebx),%xmm5
 	movl	$8,%ecx
 	movdqa	32(%ebx),%xmm4
 	movdqa	%xmm0,%xmm2
 	movdqu	%xmm0,-16(%edx)
-L102loop_key128:
+L115loop_key128:
 .byte	102,15,56,0,197
 .byte	102,15,56,221,196
 	pslld	$1,%xmm4
@@ -2150,7 +2324,7 @@
 	movdqu	%xmm0,-16(%edx)
 	movdqa	%xmm0,%xmm2
 	decl	%ecx
-	jnz	L102loop_key128
+	jnz	L115loop_key128
 	movdqa	48(%ebx),%xmm4
 .byte	102,15,56,0,197
 .byte	102,15,56,221,196
@@ -2178,69 +2352,15 @@
 	movdqu	%xmm0,16(%edx)
 	movl	$9,%ecx
 	movl	%ecx,96(%edx)
-	jmp	L101good_key
+	jmp	L116good_key
 .align	4,0x90
-L09512rounds:
+L11212rounds_alt:
 	movq	16(%eax),%xmm2
-	cmpl	$268435456,%ebp
-	je	L10312rounds_alt
-	movl	$11,%ecx
-	movups	%xmm0,-16(%edx)
-.byte	102,15,58,223,202,1
-	call	L104key_192a_cold
-.byte	102,15,58,223,202,2
-	call	L105key_192b
-.byte	102,15,58,223,202,4
-	call	L106key_192a
-.byte	102,15,58,223,202,8
-	call	L105key_192b
-.byte	102,15,58,223,202,16
-	call	L106key_192a
-.byte	102,15,58,223,202,32
-	call	L105key_192b
-.byte	102,15,58,223,202,64
-	call	L106key_192a
-.byte	102,15,58,223,202,128
-	call	L105key_192b
-	movups	%xmm0,(%edx)
-	movl	%ecx,48(%edx)
-	jmp	L101good_key
-.align	4,0x90
-L106key_192a:
-	movups	%xmm0,(%edx)
-	leal	16(%edx),%edx
-.align	4,0x90
-L104key_192a_cold:
-	movaps	%xmm2,%xmm5
-L107key_192b_warm:
-	shufps	$16,%xmm0,%xmm4
-	movdqa	%xmm2,%xmm3
-	xorps	%xmm4,%xmm0
-	shufps	$140,%xmm0,%xmm4
-	pslldq	$4,%xmm3
-	xorps	%xmm4,%xmm0
-	pshufd	$85,%xmm1,%xmm1
-	pxor	%xmm3,%xmm2
-	pxor	%xmm1,%xmm0
-	pshufd	$255,%xmm0,%xmm3
-	pxor	%xmm3,%xmm2
-	ret
-.align	4,0x90
-L105key_192b:
-	movaps	%xmm0,%xmm3
-	shufps	$68,%xmm0,%xmm5
-	movups	%xmm5,(%edx)
-	shufps	$78,%xmm2,%xmm3
-	movups	%xmm3,16(%edx)
-	leal	32(%edx),%edx
-	jmp	L107key_192b_warm
-.align	4,0x90
-L10312rounds_alt:
 	movdqa	16(%ebx),%xmm5
 	movdqa	32(%ebx),%xmm4
 	movl	$8,%ecx
 	movdqu	%xmm0,-16(%edx)
-L108loop_key192:
+L117loop_key192:
 	movq	%xmm2,(%edx)
 	movdqa	%xmm2,%xmm1
 .byte	102,15,56,0,213
@@ -2262,81 +2382,21 @@
 	pxor	%xmm3,%xmm2
 	movdqu	%xmm0,-16(%edx)
 	decl	%ecx
-	jnz	L108loop_key192
+	jnz	L117loop_key192
 	movl	$11,%ecx
 	movl	%ecx,32(%edx)
-	jmp	L101good_key
+	jmp	L116good_key
 .align	4,0x90
-L09414rounds:
+L11114rounds_alt:
 	movups	16(%eax),%xmm2
 	leal	16(%edx),%edx
-	cmpl	$268435456,%ebp
-	je	L10914rounds_alt
-	movl	$13,%ecx
-	movups	%xmm0,-32(%edx)
-	movups	%xmm2,-16(%edx)
-.byte	102,15,58,223,202,1
-	call	L110key_256a_cold
-.byte	102,15,58,223,200,1
-	call	L111key_256b
-.byte	102,15,58,223,202,2
-	call	L112key_256a
-.byte	102,15,58,223,200,2
-	call	L111key_256b
-.byte	102,15,58,223,202,4
-	call	L112key_256a
-.byte	102,15,58,223,200,4
-	call	L111key_256b
-.byte	102,15,58,223,202,8
-	call	L112key_256a
-.byte	102,15,58,223,200,8
-	call	L111key_256b
-.byte	102,15,58,223,202,16
-	call	L112key_256a
-.byte	102,15,58,223,200,16
-	call	L111key_256b
-.byte	102,15,58,223,202,32
-	call	L112key_256a
-.byte	102,15,58,223,200,32
-	call	L111key_256b
-.byte	102,15,58,223,202,64
-	call	L112key_256a
-	movups	%xmm0,(%edx)
-	movl	%ecx,16(%edx)
-	xorl	%eax,%eax
-	jmp	L101good_key
-.align	4,0x90
-L112key_256a:
-	movups	%xmm2,(%edx)
-	leal	16(%edx),%edx
-L110key_256a_cold:
-	shufps	$16,%xmm0,%xmm4
-	xorps	%xmm4,%xmm0
-	shufps	$140,%xmm0,%xmm4
-	xorps	%xmm4,%xmm0
-	shufps	$255,%xmm1,%xmm1
-	xorps	%xmm1,%xmm0
-	ret
-.align	4,0x90
-L111key_256b:
-	movups	%xmm0,(%edx)
-	leal	16(%edx),%edx
-	shufps	$16,%xmm2,%xmm4
-	xorps	%xmm4,%xmm2
-	shufps	$140,%xmm2,%xmm4
-	xorps	%xmm4,%xmm2
-	shufps	$170,%xmm1,%xmm1
-	xorps	%xmm1,%xmm2
-	ret
-.align	4,0x90
-L10914rounds_alt:
 	movdqa	(%ebx),%xmm5
 	movdqa	32(%ebx),%xmm4
 	movl	$7,%ecx
 	movdqu	%xmm0,-32(%edx)
 	movdqa	%xmm2,%xmm1
 	movdqu	%xmm2,-16(%edx)
-L113loop_key256:
+L118loop_key256:
 .byte	102,15,56,0,213
 .byte	102,15,56,221,212
 	movdqa	%xmm0,%xmm3
@@ -2350,7 +2410,7 @@
 	pxor	%xmm2,%xmm0
 	movdqu	%xmm0,(%edx)
 	decl	%ecx
-	jz	L114done_key256
+	jz	L119done_key256
 	pshufd	$255,%xmm0,%xmm2
 	pxor	%xmm3,%xmm3
 .byte	102,15,56,221,211
@@ -2365,11 +2425,11 @@
 	movdqu	%xmm2,16(%edx)
 	leal	32(%edx),%edx
 	movdqa	%xmm2,%xmm1
-	jmp	L113loop_key256
-L114done_key256:
+	jmp	L118loop_key256
+L119done_key256:
 	movl	$13,%ecx
 	movl	%ecx,16(%edx)
-L101good_key:
+L116good_key:
 	pxor	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
 	pxor	%xmm2,%xmm2
@@ -2378,36 +2438,12 @@
 	pxor	%xmm5,%xmm5
 	xorl	%eax,%eax
 	popl	%ebx
-	popl	%ebp
 	ret
 .align	2,0x90
-L096bad_keybits:
+L113bad_keybits:
 	pxor	%xmm0,%xmm0
 	movl	$-2,%eax
 	popl	%ebx
-	popl	%ebp
-	ret
-.globl	_aes_hw_set_encrypt_key
-.private_extern	_aes_hw_set_encrypt_key
-.align	4
-_aes_hw_set_encrypt_key:
-L_aes_hw_set_encrypt_key_begin:
-#ifdef BORINGSSL_DISPATCH_TEST
-	pushl	%ebx
-	pushl	%edx
-	call	L115pic
-L115pic:
-	popl	%ebx
-	leal	_BORINGSSL_function_hit+3-L115pic(%ebx),%ebx
-	movl	$1,%edx
-	movb	%dl,(%ebx)
-	popl	%edx
-	popl	%ebx
-#endif
-	movl	4(%esp),%eax
-	movl	8(%esp),%ecx
-	movl	12(%esp),%edx
-	call	__aesni_set_encrypt_key
 	ret
 .globl	_aes_hw_encrypt_key_to_decrypt_key
 .private_extern	_aes_hw_encrypt_key_to_decrypt_key
@@ -2424,7 +2460,7 @@
 	movups	%xmm1,(%edx)
 	leal	16(%edx),%edx
 	leal	-16(%eax),%eax
-L116dec_key_inverse:
+L120dec_key_inverse:
 	movups	(%edx),%xmm0
 	movups	(%eax),%xmm1
 .byte	102,15,56,219,192
@@ -2434,7 +2470,7 @@
 	movups	%xmm0,16(%eax)
 	movups	%xmm1,-16(%edx)
 	cmpl	%edx,%eax
-	ja	L116dec_key_inverse
+	ja	L120dec_key_inverse
 	movups	(%edx),%xmm0
 .byte	102,15,56,219,192
 	movups	%xmm0,(%edx)
@@ -2451,8 +2487,4 @@
 .byte	83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
 .byte	32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
 .byte	115,108,46,111,114,103,62,0
-.section __IMPORT,__pointers,non_lazy_symbol_pointers
-L_OPENSSL_ia32cap_P$non_lazy_ptr:
-.indirect_symbol	_OPENSSL_ia32cap_P
-.long	0
 #endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/aesni-x86-linux.S b/gen/bcm/aesni-x86-linux.S
index 1f15c71..6fefe17 100644
--- a/gen/bcm/aesni-x86-linux.S
+++ b/gen/bcm/aesni-x86-linux.S
@@ -16,10 +16,10 @@
 #ifdef BORINGSSL_DISPATCH_TEST
 	pushl	%ebx
 	pushl	%edx
-	call	.L000pic
-.L000pic:
+	call	.L000pic_for_function_hit
+.L000pic_for_function_hit:
 	popl	%ebx
-	leal	BORINGSSL_function_hit+1-.L000pic(%ebx),%ebx
+	leal	BORINGSSL_function_hit+1-.L000pic_for_function_hit(%ebx),%ebx
 	movl	$1,%edx
 	movb	%dl,(%ebx)
 	popl	%edx
@@ -847,10 +847,10 @@
 #ifdef BORINGSSL_DISPATCH_TEST
 	pushl	%ebx
 	pushl	%edx
-	call	.L038pic
-.L038pic:
+	call	.L038pic_for_function_hit
+.L038pic_for_function_hit:
 	popl	%ebx
-	leal	BORINGSSL_function_hit+0-.L038pic(%ebx),%ebx
+	leal	BORINGSSL_function_hit+0-.L038pic_for_function_hit(%ebx),%ebx
 	movl	$1,%edx
 	movb	%dl,(%ebx)
 	popl	%edx
@@ -2099,32 +2099,43 @@
 	popl	%ebp
 	ret
 .size	aes_hw_cbc_encrypt,.-.L_aes_hw_cbc_encrypt_begin
-.hidden	_aesni_set_encrypt_key
-.type	_aesni_set_encrypt_key,@function
+.globl	aes_hw_set_encrypt_key_base
+.hidden	aes_hw_set_encrypt_key_base
+.type	aes_hw_set_encrypt_key_base,@function
 .align	16
-_aesni_set_encrypt_key:
-	pushl	%ebp
+aes_hw_set_encrypt_key_base:
+.L_aes_hw_set_encrypt_key_base_begin:
+#ifdef BORINGSSL_DISPATCH_TEST
 	pushl	%ebx
-	call	.L093pic
-.L093pic:
+	pushl	%edx
+	call	.L093pic_for_function_hit
+.L093pic_for_function_hit:
 	popl	%ebx
-	leal	.Lkey_const-.L093pic(%ebx),%ebx
-	leal	OPENSSL_ia32cap_P-.Lkey_const(%ebx),%ebp
+	leal	BORINGSSL_function_hit+3-.L093pic_for_function_hit(%ebx),%ebx
+	movl	$1,%edx
+	movb	%dl,(%ebx)
+	popl	%edx
+	popl	%ebx
+#endif
+	movl	4(%esp),%eax
+	movl	8(%esp),%ecx
+	movl	12(%esp),%edx
+	pushl	%ebx
+	call	.L094pic
+.L094pic:
+	popl	%ebx
+	leal	.Lkey_const-.L094pic(%ebx),%ebx
 	movups	(%eax),%xmm0
 	xorps	%xmm4,%xmm4
-	movl	4(%ebp),%ebp
 	leal	16(%edx),%edx
-	andl	$268437504,%ebp
 	cmpl	$256,%ecx
-	je	.L09414rounds
+	je	.L09514rounds
 	cmpl	$192,%ecx
-	je	.L09512rounds
+	je	.L09612rounds
 	cmpl	$128,%ecx
-	jne	.L096bad_keybits
+	jne	.L097bad_keybits
 .align	16
-.L09710rounds:
-	cmpl	$268435456,%ebp
-	je	.L09810rounds_alt
+.L09810rounds:
 	movl	$9,%ecx
 	movups	%xmm0,-16(%edx)
 .byte	102,15,58,223,200,1
@@ -2163,13 +2174,178 @@
 	xorps	%xmm1,%xmm0
 	ret
 .align	16
-.L09810rounds_alt:
+.L09612rounds:
+	movq	16(%eax),%xmm2
+	movl	$11,%ecx
+	movups	%xmm0,-16(%edx)
+.byte	102,15,58,223,202,1
+	call	.L102key_192a_cold
+.byte	102,15,58,223,202,2
+	call	.L103key_192b
+.byte	102,15,58,223,202,4
+	call	.L104key_192a
+.byte	102,15,58,223,202,8
+	call	.L103key_192b
+.byte	102,15,58,223,202,16
+	call	.L104key_192a
+.byte	102,15,58,223,202,32
+	call	.L103key_192b
+.byte	102,15,58,223,202,64
+	call	.L104key_192a
+.byte	102,15,58,223,202,128
+	call	.L103key_192b
+	movups	%xmm0,(%edx)
+	movl	%ecx,48(%edx)
+	jmp	.L101good_key
+.align	16
+.L104key_192a:
+	movups	%xmm0,(%edx)
+	leal	16(%edx),%edx
+.align	16
+.L102key_192a_cold:
+	movaps	%xmm2,%xmm5
+.L105key_192b_warm:
+	shufps	$16,%xmm0,%xmm4
+	movdqa	%xmm2,%xmm3
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	pslldq	$4,%xmm3
+	xorps	%xmm4,%xmm0
+	pshufd	$85,%xmm1,%xmm1
+	pxor	%xmm3,%xmm2
+	pxor	%xmm1,%xmm0
+	pshufd	$255,%xmm0,%xmm3
+	pxor	%xmm3,%xmm2
+	ret
+.align	16
+.L103key_192b:
+	movaps	%xmm0,%xmm3
+	shufps	$68,%xmm0,%xmm5
+	movups	%xmm5,(%edx)
+	shufps	$78,%xmm2,%xmm3
+	movups	%xmm3,16(%edx)
+	leal	32(%edx),%edx
+	jmp	.L105key_192b_warm
+.align	16
+.L09514rounds:
+	movups	16(%eax),%xmm2
+	leal	16(%edx),%edx
+	movl	$13,%ecx
+	movups	%xmm0,-32(%edx)
+	movups	%xmm2,-16(%edx)
+.byte	102,15,58,223,202,1
+	call	.L106key_256a_cold
+.byte	102,15,58,223,200,1
+	call	.L107key_256b
+.byte	102,15,58,223,202,2
+	call	.L108key_256a
+.byte	102,15,58,223,200,2
+	call	.L107key_256b
+.byte	102,15,58,223,202,4
+	call	.L108key_256a
+.byte	102,15,58,223,200,4
+	call	.L107key_256b
+.byte	102,15,58,223,202,8
+	call	.L108key_256a
+.byte	102,15,58,223,200,8
+	call	.L107key_256b
+.byte	102,15,58,223,202,16
+	call	.L108key_256a
+.byte	102,15,58,223,200,16
+	call	.L107key_256b
+.byte	102,15,58,223,202,32
+	call	.L108key_256a
+.byte	102,15,58,223,200,32
+	call	.L107key_256b
+.byte	102,15,58,223,202,64
+	call	.L108key_256a
+	movups	%xmm0,(%edx)
+	movl	%ecx,16(%edx)
+	xorl	%eax,%eax
+	jmp	.L101good_key
+.align	16
+.L108key_256a:
+	movups	%xmm2,(%edx)
+	leal	16(%edx),%edx
+.L106key_256a_cold:
+	shufps	$16,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$255,%xmm1,%xmm1
+	xorps	%xmm1,%xmm0
+	ret
+.align	16
+.L107key_256b:
+	movups	%xmm0,(%edx)
+	leal	16(%edx),%edx
+	shufps	$16,%xmm2,%xmm4
+	xorps	%xmm4,%xmm2
+	shufps	$140,%xmm2,%xmm4
+	xorps	%xmm4,%xmm2
+	shufps	$170,%xmm1,%xmm1
+	xorps	%xmm1,%xmm2
+	ret
+.L101good_key:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	xorl	%eax,%eax
+	popl	%ebx
+	ret
+.align	4
+.L097bad_keybits:
+	pxor	%xmm0,%xmm0
+	movl	$-2,%eax
+	popl	%ebx
+	ret
+.size	aes_hw_set_encrypt_key_base,.-.L_aes_hw_set_encrypt_key_base_begin
+.globl	aes_hw_set_encrypt_key_alt
+.hidden	aes_hw_set_encrypt_key_alt
+.type	aes_hw_set_encrypt_key_alt,@function
+.align	16
+aes_hw_set_encrypt_key_alt:
+.L_aes_hw_set_encrypt_key_alt_begin:
+#ifdef BORINGSSL_DISPATCH_TEST
+	pushl	%ebx
+	pushl	%edx
+	call	.L109pic_for_function_hit
+.L109pic_for_function_hit:
+	popl	%ebx
+	leal	BORINGSSL_function_hit+3-.L109pic_for_function_hit(%ebx),%ebx
+	movl	$1,%edx
+	movb	%dl,(%ebx)
+	popl	%edx
+	popl	%ebx
+#endif
+	movl	4(%esp),%eax
+	movl	8(%esp),%ecx
+	movl	12(%esp),%edx
+	pushl	%ebx
+	call	.L110pic
+.L110pic:
+	popl	%ebx
+	leal	.Lkey_const-.L110pic(%ebx),%ebx
+	movups	(%eax),%xmm0
+	xorps	%xmm4,%xmm4
+	leal	16(%edx),%edx
+	cmpl	$256,%ecx
+	je	.L11114rounds_alt
+	cmpl	$192,%ecx
+	je	.L11212rounds_alt
+	cmpl	$128,%ecx
+	jne	.L113bad_keybits
+.align	16
+.L11410rounds_alt:
 	movdqa	(%ebx),%xmm5
 	movl	$8,%ecx
 	movdqa	32(%ebx),%xmm4
 	movdqa	%xmm0,%xmm2
 	movdqu	%xmm0,-16(%edx)
-.L102loop_key128:
+.L115loop_key128:
 .byte	102,15,56,0,197
 .byte	102,15,56,221,196
 	pslld	$1,%xmm4
@@ -2185,7 +2361,7 @@
 	movdqu	%xmm0,-16(%edx)
 	movdqa	%xmm0,%xmm2
 	decl	%ecx
-	jnz	.L102loop_key128
+	jnz	.L115loop_key128
 	movdqa	48(%ebx),%xmm4
 .byte	102,15,56,0,197
 .byte	102,15,56,221,196
@@ -2213,69 +2389,15 @@
 	movdqu	%xmm0,16(%edx)
 	movl	$9,%ecx
 	movl	%ecx,96(%edx)
-	jmp	.L101good_key
+	jmp	.L116good_key
 .align	16
-.L09512rounds:
+.L11212rounds_alt:
 	movq	16(%eax),%xmm2
-	cmpl	$268435456,%ebp
-	je	.L10312rounds_alt
-	movl	$11,%ecx
-	movups	%xmm0,-16(%edx)
-.byte	102,15,58,223,202,1
-	call	.L104key_192a_cold
-.byte	102,15,58,223,202,2
-	call	.L105key_192b
-.byte	102,15,58,223,202,4
-	call	.L106key_192a
-.byte	102,15,58,223,202,8
-	call	.L105key_192b
-.byte	102,15,58,223,202,16
-	call	.L106key_192a
-.byte	102,15,58,223,202,32
-	call	.L105key_192b
-.byte	102,15,58,223,202,64
-	call	.L106key_192a
-.byte	102,15,58,223,202,128
-	call	.L105key_192b
-	movups	%xmm0,(%edx)
-	movl	%ecx,48(%edx)
-	jmp	.L101good_key
-.align	16
-.L106key_192a:
-	movups	%xmm0,(%edx)
-	leal	16(%edx),%edx
-.align	16
-.L104key_192a_cold:
-	movaps	%xmm2,%xmm5
-.L107key_192b_warm:
-	shufps	$16,%xmm0,%xmm4
-	movdqa	%xmm2,%xmm3
-	xorps	%xmm4,%xmm0
-	shufps	$140,%xmm0,%xmm4
-	pslldq	$4,%xmm3
-	xorps	%xmm4,%xmm0
-	pshufd	$85,%xmm1,%xmm1
-	pxor	%xmm3,%xmm2
-	pxor	%xmm1,%xmm0
-	pshufd	$255,%xmm0,%xmm3
-	pxor	%xmm3,%xmm2
-	ret
-.align	16
-.L105key_192b:
-	movaps	%xmm0,%xmm3
-	shufps	$68,%xmm0,%xmm5
-	movups	%xmm5,(%edx)
-	shufps	$78,%xmm2,%xmm3
-	movups	%xmm3,16(%edx)
-	leal	32(%edx),%edx
-	jmp	.L107key_192b_warm
-.align	16
-.L10312rounds_alt:
 	movdqa	16(%ebx),%xmm5
 	movdqa	32(%ebx),%xmm4
 	movl	$8,%ecx
 	movdqu	%xmm0,-16(%edx)
-.L108loop_key192:
+.L117loop_key192:
 	movq	%xmm2,(%edx)
 	movdqa	%xmm2,%xmm1
 .byte	102,15,56,0,213
@@ -2297,81 +2419,21 @@
 	pxor	%xmm3,%xmm2
 	movdqu	%xmm0,-16(%edx)
 	decl	%ecx
-	jnz	.L108loop_key192
+	jnz	.L117loop_key192
 	movl	$11,%ecx
 	movl	%ecx,32(%edx)
-	jmp	.L101good_key
+	jmp	.L116good_key
 .align	16
-.L09414rounds:
+.L11114rounds_alt:
 	movups	16(%eax),%xmm2
 	leal	16(%edx),%edx
-	cmpl	$268435456,%ebp
-	je	.L10914rounds_alt
-	movl	$13,%ecx
-	movups	%xmm0,-32(%edx)
-	movups	%xmm2,-16(%edx)
-.byte	102,15,58,223,202,1
-	call	.L110key_256a_cold
-.byte	102,15,58,223,200,1
-	call	.L111key_256b
-.byte	102,15,58,223,202,2
-	call	.L112key_256a
-.byte	102,15,58,223,200,2
-	call	.L111key_256b
-.byte	102,15,58,223,202,4
-	call	.L112key_256a
-.byte	102,15,58,223,200,4
-	call	.L111key_256b
-.byte	102,15,58,223,202,8
-	call	.L112key_256a
-.byte	102,15,58,223,200,8
-	call	.L111key_256b
-.byte	102,15,58,223,202,16
-	call	.L112key_256a
-.byte	102,15,58,223,200,16
-	call	.L111key_256b
-.byte	102,15,58,223,202,32
-	call	.L112key_256a
-.byte	102,15,58,223,200,32
-	call	.L111key_256b
-.byte	102,15,58,223,202,64
-	call	.L112key_256a
-	movups	%xmm0,(%edx)
-	movl	%ecx,16(%edx)
-	xorl	%eax,%eax
-	jmp	.L101good_key
-.align	16
-.L112key_256a:
-	movups	%xmm2,(%edx)
-	leal	16(%edx),%edx
-.L110key_256a_cold:
-	shufps	$16,%xmm0,%xmm4
-	xorps	%xmm4,%xmm0
-	shufps	$140,%xmm0,%xmm4
-	xorps	%xmm4,%xmm0
-	shufps	$255,%xmm1,%xmm1
-	xorps	%xmm1,%xmm0
-	ret
-.align	16
-.L111key_256b:
-	movups	%xmm0,(%edx)
-	leal	16(%edx),%edx
-	shufps	$16,%xmm2,%xmm4
-	xorps	%xmm4,%xmm2
-	shufps	$140,%xmm2,%xmm4
-	xorps	%xmm4,%xmm2
-	shufps	$170,%xmm1,%xmm1
-	xorps	%xmm1,%xmm2
-	ret
-.align	16
-.L10914rounds_alt:
 	movdqa	(%ebx),%xmm5
 	movdqa	32(%ebx),%xmm4
 	movl	$7,%ecx
 	movdqu	%xmm0,-32(%edx)
 	movdqa	%xmm2,%xmm1
 	movdqu	%xmm2,-16(%edx)
-.L113loop_key256:
+.L118loop_key256:
 .byte	102,15,56,0,213
 .byte	102,15,56,221,212
 	movdqa	%xmm0,%xmm3
@@ -2385,7 +2447,7 @@
 	pxor	%xmm2,%xmm0
 	movdqu	%xmm0,(%edx)
 	decl	%ecx
-	jz	.L114done_key256
+	jz	.L119done_key256
 	pshufd	$255,%xmm0,%xmm2
 	pxor	%xmm3,%xmm3
 .byte	102,15,56,221,211
@@ -2400,11 +2462,11 @@
 	movdqu	%xmm2,16(%edx)
 	leal	32(%edx),%edx
 	movdqa	%xmm2,%xmm1
-	jmp	.L113loop_key256
-.L114done_key256:
+	jmp	.L118loop_key256
+.L119done_key256:
 	movl	$13,%ecx
 	movl	%ecx,16(%edx)
-.L101good_key:
+.L116good_key:
 	pxor	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
 	pxor	%xmm2,%xmm2
@@ -2413,40 +2475,14 @@
 	pxor	%xmm5,%xmm5
 	xorl	%eax,%eax
 	popl	%ebx
-	popl	%ebp
 	ret
 .align	4
-.L096bad_keybits:
+.L113bad_keybits:
 	pxor	%xmm0,%xmm0
 	movl	$-2,%eax
 	popl	%ebx
-	popl	%ebp
 	ret
-.size	_aesni_set_encrypt_key,.-_aesni_set_encrypt_key
-.globl	aes_hw_set_encrypt_key
-.hidden	aes_hw_set_encrypt_key
-.type	aes_hw_set_encrypt_key,@function
-.align	16
-aes_hw_set_encrypt_key:
-.L_aes_hw_set_encrypt_key_begin:
-#ifdef BORINGSSL_DISPATCH_TEST
-	pushl	%ebx
-	pushl	%edx
-	call	.L115pic
-.L115pic:
-	popl	%ebx
-	leal	BORINGSSL_function_hit+3-.L115pic(%ebx),%ebx
-	movl	$1,%edx
-	movb	%dl,(%ebx)
-	popl	%edx
-	popl	%ebx
-#endif
-	movl	4(%esp),%eax
-	movl	8(%esp),%ecx
-	movl	12(%esp),%edx
-	call	_aesni_set_encrypt_key
-	ret
-.size	aes_hw_set_encrypt_key,.-.L_aes_hw_set_encrypt_key_begin
+.size	aes_hw_set_encrypt_key_alt,.-.L_aes_hw_set_encrypt_key_alt_begin
 .globl	aes_hw_encrypt_key_to_decrypt_key
 .hidden	aes_hw_encrypt_key_to_decrypt_key
 .type	aes_hw_encrypt_key_to_decrypt_key,@function
@@ -2463,7 +2499,7 @@
 	movups	%xmm1,(%edx)
 	leal	16(%edx),%edx
 	leal	-16(%eax),%eax
-.L116dec_key_inverse:
+.L120dec_key_inverse:
 	movups	(%edx),%xmm0
 	movups	(%eax),%xmm1
 .byte	102,15,56,219,192
@@ -2473,7 +2509,7 @@
 	movups	%xmm0,16(%eax)
 	movups	%xmm1,-16(%edx)
 	cmpl	%edx,%eax
-	ja	.L116dec_key_inverse
+	ja	.L120dec_key_inverse
 	movups	(%edx),%xmm0
 .byte	102,15,56,219,192
 	movups	%xmm0,(%edx)
diff --git a/gen/bcm/aesni-x86-win.asm b/gen/bcm/aesni-x86-win.asm
index 2d32e77..4453afb 100644
--- a/gen/bcm/aesni-x86-win.asm
+++ b/gen/bcm/aesni-x86-win.asm
@@ -13,7 +13,6 @@
 %else
 section	.text	code
 %endif
-;extern	_OPENSSL_ia32cap_P
 %ifdef BORINGSSL_DISPATCH_TEST
 extern	_BORINGSSL_function_hit
 %endif
@@ -24,10 +23,10 @@
 %ifdef BORINGSSL_DISPATCH_TEST
 	push	ebx
 	push	edx
-	call	L$000pic
-L$000pic:
+	call	L$000pic_for_function_hit
+L$000pic_for_function_hit:
 	pop	ebx
-	lea	ebx,[(_BORINGSSL_function_hit+1-L$000pic)+ebx]
+	lea	ebx,[(_BORINGSSL_function_hit+1-L$000pic_for_function_hit)+ebx]
 	mov	edx,1
 	mov	BYTE [ebx],dl
 	pop	edx
@@ -816,10 +815,10 @@
 %ifdef BORINGSSL_DISPATCH_TEST
 	push	ebx
 	push	edx
-	call	L$038pic
-L$038pic:
+	call	L$038pic_for_function_hit
+L$038pic_for_function_hit:
 	pop	ebx
-	lea	ebx,[(_BORINGSSL_function_hit+0-L$038pic)+ebx]
+	lea	ebx,[(_BORINGSSL_function_hit+0-L$038pic_for_function_hit)+ebx]
 	mov	edx,1
 	mov	BYTE [ebx],dl
 	pop	edx
@@ -2058,30 +2057,41 @@
 	pop	ebx
 	pop	ebp
 	ret
+global	_aes_hw_set_encrypt_key_base
 align	16
-__aesni_set_encrypt_key:
-	push	ebp
+_aes_hw_set_encrypt_key_base:
+L$_aes_hw_set_encrypt_key_base_begin:
+%ifdef BORINGSSL_DISPATCH_TEST
 	push	ebx
-	call	L$093pic
-L$093pic:
+	push	edx
+	call	L$093pic_for_function_hit
+L$093pic_for_function_hit:
 	pop	ebx
-	lea	ebx,[(L$key_const-L$093pic)+ebx]
-	lea	ebp,[_OPENSSL_ia32cap_P]
+	lea	ebx,[(_BORINGSSL_function_hit+3-L$093pic_for_function_hit)+ebx]
+	mov	edx,1
+	mov	BYTE [ebx],dl
+	pop	edx
+	pop	ebx
+%endif
+	mov	eax,DWORD [4+esp]
+	mov	ecx,DWORD [8+esp]
+	mov	edx,DWORD [12+esp]
+	push	ebx
+	call	L$094pic
+L$094pic:
+	pop	ebx
+	lea	ebx,[(L$key_const-L$094pic)+ebx]
 	movups	xmm0,[eax]
 	xorps	xmm4,xmm4
-	mov	ebp,DWORD [4+ebp]
 	lea	edx,[16+edx]
-	and	ebp,268437504
 	cmp	ecx,256
-	je	NEAR L$09414rounds
+	je	NEAR L$09514rounds
 	cmp	ecx,192
-	je	NEAR L$09512rounds
+	je	NEAR L$09612rounds
 	cmp	ecx,128
-	jne	NEAR L$096bad_keybits
+	jne	NEAR L$097bad_keybits
 align	16
-L$09710rounds:
-	cmp	ebp,268435456
-	je	NEAR L$09810rounds_alt
+L$09810rounds:
 	mov	ecx,9
 	movups	[edx-16],xmm0
 db	102,15,58,223,200,1
@@ -2120,13 +2130,175 @@
 	xorps	xmm0,xmm1
 	ret
 align	16
-L$09810rounds_alt:
+L$09612rounds:
+	movq	xmm2,[16+eax]
+	mov	ecx,11
+	movups	[edx-16],xmm0
+db	102,15,58,223,202,1
+	call	L$102key_192a_cold
+db	102,15,58,223,202,2
+	call	L$103key_192b
+db	102,15,58,223,202,4
+	call	L$104key_192a
+db	102,15,58,223,202,8
+	call	L$103key_192b
+db	102,15,58,223,202,16
+	call	L$104key_192a
+db	102,15,58,223,202,32
+	call	L$103key_192b
+db	102,15,58,223,202,64
+	call	L$104key_192a
+db	102,15,58,223,202,128
+	call	L$103key_192b
+	movups	[edx],xmm0
+	mov	DWORD [48+edx],ecx
+	jmp	NEAR L$101good_key
+align	16
+L$104key_192a:
+	movups	[edx],xmm0
+	lea	edx,[16+edx]
+align	16
+L$102key_192a_cold:
+	movaps	xmm5,xmm2
+L$105key_192b_warm:
+	shufps	xmm4,xmm0,16
+	movdqa	xmm3,xmm2
+	xorps	xmm0,xmm4
+	shufps	xmm4,xmm0,140
+	pslldq	xmm3,4
+	xorps	xmm0,xmm4
+	pshufd	xmm1,xmm1,85
+	pxor	xmm2,xmm3
+	pxor	xmm0,xmm1
+	pshufd	xmm3,xmm0,255
+	pxor	xmm2,xmm3
+	ret
+align	16
+L$103key_192b:
+	movaps	xmm3,xmm0
+	shufps	xmm5,xmm0,68
+	movups	[edx],xmm5
+	shufps	xmm3,xmm2,78
+	movups	[16+edx],xmm3
+	lea	edx,[32+edx]
+	jmp	NEAR L$105key_192b_warm
+align	16
+L$09514rounds:
+	movups	xmm2,[16+eax]
+	lea	edx,[16+edx]
+	mov	ecx,13
+	movups	[edx-32],xmm0
+	movups	[edx-16],xmm2
+db	102,15,58,223,202,1
+	call	L$106key_256a_cold
+db	102,15,58,223,200,1
+	call	L$107key_256b
+db	102,15,58,223,202,2
+	call	L$108key_256a
+db	102,15,58,223,200,2
+	call	L$107key_256b
+db	102,15,58,223,202,4
+	call	L$108key_256a
+db	102,15,58,223,200,4
+	call	L$107key_256b
+db	102,15,58,223,202,8
+	call	L$108key_256a
+db	102,15,58,223,200,8
+	call	L$107key_256b
+db	102,15,58,223,202,16
+	call	L$108key_256a
+db	102,15,58,223,200,16
+	call	L$107key_256b
+db	102,15,58,223,202,32
+	call	L$108key_256a
+db	102,15,58,223,200,32
+	call	L$107key_256b
+db	102,15,58,223,202,64
+	call	L$108key_256a
+	movups	[edx],xmm0
+	mov	DWORD [16+edx],ecx
+	xor	eax,eax
+	jmp	NEAR L$101good_key
+align	16
+L$108key_256a:
+	movups	[edx],xmm2
+	lea	edx,[16+edx]
+L$106key_256a_cold:
+	shufps	xmm4,xmm0,16
+	xorps	xmm0,xmm4
+	shufps	xmm4,xmm0,140
+	xorps	xmm0,xmm4
+	shufps	xmm1,xmm1,255
+	xorps	xmm0,xmm1
+	ret
+align	16
+L$107key_256b:
+	movups	[edx],xmm0
+	lea	edx,[16+edx]
+	shufps	xmm4,xmm2,16
+	xorps	xmm2,xmm4
+	shufps	xmm4,xmm2,140
+	xorps	xmm2,xmm4
+	shufps	xmm1,xmm1,170
+	xorps	xmm2,xmm1
+	ret
+L$101good_key:
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	xor	eax,eax
+	pop	ebx
+	ret
+align	4
+L$097bad_keybits:
+	pxor	xmm0,xmm0
+	mov	eax,-2
+	pop	ebx
+	ret
+global	_aes_hw_set_encrypt_key_alt
+align	16
+_aes_hw_set_encrypt_key_alt:
+L$_aes_hw_set_encrypt_key_alt_begin:
+%ifdef BORINGSSL_DISPATCH_TEST
+	push	ebx
+	push	edx
+	call	L$109pic_for_function_hit
+L$109pic_for_function_hit:
+	pop	ebx
+	lea	ebx,[(_BORINGSSL_function_hit+3-L$109pic_for_function_hit)+ebx]
+	mov	edx,1
+	mov	BYTE [ebx],dl
+	pop	edx
+	pop	ebx
+%endif
+	mov	eax,DWORD [4+esp]
+	mov	ecx,DWORD [8+esp]
+	mov	edx,DWORD [12+esp]
+	push	ebx
+	call	L$110pic
+L$110pic:
+	pop	ebx
+	lea	ebx,[(L$key_const-L$110pic)+ebx]
+	movups	xmm0,[eax]
+	xorps	xmm4,xmm4
+	lea	edx,[16+edx]
+	cmp	ecx,256
+	je	NEAR L$11114rounds_alt
+	cmp	ecx,192
+	je	NEAR L$11212rounds_alt
+	cmp	ecx,128
+	jne	NEAR L$113bad_keybits
+align	16
+L$11410rounds_alt:
 	movdqa	xmm5,[ebx]
 	mov	ecx,8
 	movdqa	xmm4,[32+ebx]
 	movdqa	xmm2,xmm0
 	movdqu	[edx-16],xmm0
-L$102loop_key128:
+L$115loop_key128:
 db	102,15,56,0,197
 db	102,15,56,221,196
 	pslld	xmm4,1
@@ -2142,7 +2314,7 @@
 	movdqu	[edx-16],xmm0
 	movdqa	xmm2,xmm0
 	dec	ecx
-	jnz	NEAR L$102loop_key128
+	jnz	NEAR L$115loop_key128
 	movdqa	xmm4,[48+ebx]
 db	102,15,56,0,197
 db	102,15,56,221,196
@@ -2170,69 +2342,15 @@
 	movdqu	[16+edx],xmm0
 	mov	ecx,9
 	mov	DWORD [96+edx],ecx
-	jmp	NEAR L$101good_key
+	jmp	NEAR L$116good_key
 align	16
-L$09512rounds:
+L$11212rounds_alt:
 	movq	xmm2,[16+eax]
-	cmp	ebp,268435456
-	je	NEAR L$10312rounds_alt
-	mov	ecx,11
-	movups	[edx-16],xmm0
-db	102,15,58,223,202,1
-	call	L$104key_192a_cold
-db	102,15,58,223,202,2
-	call	L$105key_192b
-db	102,15,58,223,202,4
-	call	L$106key_192a
-db	102,15,58,223,202,8
-	call	L$105key_192b
-db	102,15,58,223,202,16
-	call	L$106key_192a
-db	102,15,58,223,202,32
-	call	L$105key_192b
-db	102,15,58,223,202,64
-	call	L$106key_192a
-db	102,15,58,223,202,128
-	call	L$105key_192b
-	movups	[edx],xmm0
-	mov	DWORD [48+edx],ecx
-	jmp	NEAR L$101good_key
-align	16
-L$106key_192a:
-	movups	[edx],xmm0
-	lea	edx,[16+edx]
-align	16
-L$104key_192a_cold:
-	movaps	xmm5,xmm2
-L$107key_192b_warm:
-	shufps	xmm4,xmm0,16
-	movdqa	xmm3,xmm2
-	xorps	xmm0,xmm4
-	shufps	xmm4,xmm0,140
-	pslldq	xmm3,4
-	xorps	xmm0,xmm4
-	pshufd	xmm1,xmm1,85
-	pxor	xmm2,xmm3
-	pxor	xmm0,xmm1
-	pshufd	xmm3,xmm0,255
-	pxor	xmm2,xmm3
-	ret
-align	16
-L$105key_192b:
-	movaps	xmm3,xmm0
-	shufps	xmm5,xmm0,68
-	movups	[edx],xmm5
-	shufps	xmm3,xmm2,78
-	movups	[16+edx],xmm3
-	lea	edx,[32+edx]
-	jmp	NEAR L$107key_192b_warm
-align	16
-L$10312rounds_alt:
 	movdqa	xmm5,[16+ebx]
 	movdqa	xmm4,[32+ebx]
 	mov	ecx,8
 	movdqu	[edx-16],xmm0
-L$108loop_key192:
+L$117loop_key192:
 	movq	[edx],xmm2
 	movdqa	xmm1,xmm2
 db	102,15,56,0,213
@@ -2254,81 +2372,21 @@
 	pxor	xmm2,xmm3
 	movdqu	[edx-16],xmm0
 	dec	ecx
-	jnz	NEAR L$108loop_key192
+	jnz	NEAR L$117loop_key192
 	mov	ecx,11
 	mov	DWORD [32+edx],ecx
-	jmp	NEAR L$101good_key
+	jmp	NEAR L$116good_key
 align	16
-L$09414rounds:
+L$11114rounds_alt:
 	movups	xmm2,[16+eax]
 	lea	edx,[16+edx]
-	cmp	ebp,268435456
-	je	NEAR L$10914rounds_alt
-	mov	ecx,13
-	movups	[edx-32],xmm0
-	movups	[edx-16],xmm2
-db	102,15,58,223,202,1
-	call	L$110key_256a_cold
-db	102,15,58,223,200,1
-	call	L$111key_256b
-db	102,15,58,223,202,2
-	call	L$112key_256a
-db	102,15,58,223,200,2
-	call	L$111key_256b
-db	102,15,58,223,202,4
-	call	L$112key_256a
-db	102,15,58,223,200,4
-	call	L$111key_256b
-db	102,15,58,223,202,8
-	call	L$112key_256a
-db	102,15,58,223,200,8
-	call	L$111key_256b
-db	102,15,58,223,202,16
-	call	L$112key_256a
-db	102,15,58,223,200,16
-	call	L$111key_256b
-db	102,15,58,223,202,32
-	call	L$112key_256a
-db	102,15,58,223,200,32
-	call	L$111key_256b
-db	102,15,58,223,202,64
-	call	L$112key_256a
-	movups	[edx],xmm0
-	mov	DWORD [16+edx],ecx
-	xor	eax,eax
-	jmp	NEAR L$101good_key
-align	16
-L$112key_256a:
-	movups	[edx],xmm2
-	lea	edx,[16+edx]
-L$110key_256a_cold:
-	shufps	xmm4,xmm0,16
-	xorps	xmm0,xmm4
-	shufps	xmm4,xmm0,140
-	xorps	xmm0,xmm4
-	shufps	xmm1,xmm1,255
-	xorps	xmm0,xmm1
-	ret
-align	16
-L$111key_256b:
-	movups	[edx],xmm0
-	lea	edx,[16+edx]
-	shufps	xmm4,xmm2,16
-	xorps	xmm2,xmm4
-	shufps	xmm4,xmm2,140
-	xorps	xmm2,xmm4
-	shufps	xmm1,xmm1,170
-	xorps	xmm2,xmm1
-	ret
-align	16
-L$10914rounds_alt:
 	movdqa	xmm5,[ebx]
 	movdqa	xmm4,[32+ebx]
 	mov	ecx,7
 	movdqu	[edx-32],xmm0
 	movdqa	xmm1,xmm2
 	movdqu	[edx-16],xmm2
-L$113loop_key256:
+L$118loop_key256:
 db	102,15,56,0,213
 db	102,15,56,221,212
 	movdqa	xmm3,xmm0
@@ -2342,7 +2400,7 @@
 	pxor	xmm0,xmm2
 	movdqu	[edx],xmm0
 	dec	ecx
-	jz	NEAR L$114done_key256
+	jz	NEAR L$119done_key256
 	pshufd	xmm2,xmm0,255
 	pxor	xmm3,xmm3
 db	102,15,56,221,211
@@ -2357,11 +2415,11 @@
 	movdqu	[16+edx],xmm2
 	lea	edx,[32+edx]
 	movdqa	xmm1,xmm2
-	jmp	NEAR L$113loop_key256
-L$114done_key256:
+	jmp	NEAR L$118loop_key256
+L$119done_key256:
 	mov	ecx,13
 	mov	DWORD [16+edx],ecx
-L$101good_key:
+L$116good_key:
 	pxor	xmm0,xmm0
 	pxor	xmm1,xmm1
 	pxor	xmm2,xmm2
@@ -2370,35 +2428,12 @@
 	pxor	xmm5,xmm5
 	xor	eax,eax
 	pop	ebx
-	pop	ebp
 	ret
 align	4
-L$096bad_keybits:
+L$113bad_keybits:
 	pxor	xmm0,xmm0
 	mov	eax,-2
 	pop	ebx
-	pop	ebp
-	ret
-global	_aes_hw_set_encrypt_key
-align	16
-_aes_hw_set_encrypt_key:
-L$_aes_hw_set_encrypt_key_begin:
-%ifdef BORINGSSL_DISPATCH_TEST
-	push	ebx
-	push	edx
-	call	L$115pic
-L$115pic:
-	pop	ebx
-	lea	ebx,[(_BORINGSSL_function_hit+3-L$115pic)+ebx]
-	mov	edx,1
-	mov	BYTE [ebx],dl
-	pop	edx
-	pop	ebx
-%endif
-	mov	eax,DWORD [4+esp]
-	mov	ecx,DWORD [8+esp]
-	mov	edx,DWORD [12+esp]
-	call	__aesni_set_encrypt_key
 	ret
 global	_aes_hw_encrypt_key_to_decrypt_key
 align	16
@@ -2414,7 +2449,7 @@
 	movups	[edx],xmm1
 	lea	edx,[16+edx]
 	lea	eax,[eax-16]
-L$116dec_key_inverse:
+L$120dec_key_inverse:
 	movups	xmm0,[edx]
 	movups	xmm1,[eax]
 db	102,15,56,219,192
@@ -2424,7 +2459,7 @@
 	movups	[16+eax],xmm0
 	movups	[edx-16],xmm1
 	cmp	eax,edx
-	ja	NEAR L$116dec_key_inverse
+	ja	NEAR L$120dec_key_inverse
 	movups	xmm0,[edx]
 db	102,15,56,219,192
 	movups	[edx],xmm0
@@ -2441,8 +2476,6 @@
 db	83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
 db	32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
 db	115,108,46,111,114,103,62,0
-segment	.bss
-common	_OPENSSL_ia32cap_P 16
 %else
 ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
 ret
diff --git a/gen/bcm/aesni-x86_64-apple.S b/gen/bcm/aesni-x86_64-apple.S
index ccf9f8f..23c15c3 100644
--- a/gen/bcm/aesni-x86_64-apple.S
+++ b/gen/bcm/aesni-x86_64-apple.S
@@ -5,7 +5,6 @@
 
 #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
 .text	
-
 .globl	_aes_hw_encrypt
 .private_extern _aes_hw_encrypt
 
@@ -1945,11 +1944,11 @@
 	ret
 
 
-.globl	_aes_hw_set_encrypt_key
-.private_extern _aes_hw_set_encrypt_key
+.globl	_aes_hw_set_encrypt_key_base
+.private_extern _aes_hw_set_encrypt_key_base
 
 .p2align	4
-_aes_hw_set_encrypt_key:
+_aes_hw_set_encrypt_key_base:
 
 
 _CET_ENDBR
@@ -1962,9 +1961,6 @@
 
 	movups	(%rdi),%xmm0
 	xorps	%xmm4,%xmm4
-	leaq	_OPENSSL_ia32cap_P(%rip),%r10
-	movl	4(%r10),%r10d
-	andl	$268437504,%r10d
 	leaq	16(%rdx),%rax
 	cmpl	$256,%esi
 	je	L$14rounds
@@ -1975,8 +1971,6 @@
 
 L$10rounds:
 	movl	$9,%esi
-	cmpl	$268435456,%r10d
-	je	L$10rounds_alt
 
 	movups	%xmm0,(%rdx)
 .byte	102,15,58,223,200,1
@@ -2005,78 +1999,9 @@
 	jmp	L$enc_key_ret
 
 .p2align	4
-L$10rounds_alt:
-	movdqa	L$key_rotate(%rip),%xmm5
-	movl	$8,%r10d
-	movdqa	L$key_rcon1(%rip),%xmm4
-	movdqa	%xmm0,%xmm2
-	movdqu	%xmm0,(%rdx)
-	jmp	L$oop_key128
-
-.p2align	4
-L$oop_key128:
-.byte	102,15,56,0,197
-.byte	102,15,56,221,196
-	pslld	$1,%xmm4
-	leaq	16(%rax),%rax
-
-	movdqa	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm3,%xmm2
-
-	pxor	%xmm2,%xmm0
-	movdqu	%xmm0,-16(%rax)
-	movdqa	%xmm0,%xmm2
-
-	decl	%r10d
-	jnz	L$oop_key128
-
-	movdqa	L$key_rcon1b(%rip),%xmm4
-
-.byte	102,15,56,0,197
-.byte	102,15,56,221,196
-	pslld	$1,%xmm4
-
-	movdqa	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm3,%xmm2
-
-	pxor	%xmm2,%xmm0
-	movdqu	%xmm0,(%rax)
-
-	movdqa	%xmm0,%xmm2
-.byte	102,15,56,0,197
-.byte	102,15,56,221,196
-
-	movdqa	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm3,%xmm2
-
-	pxor	%xmm2,%xmm0
-	movdqu	%xmm0,16(%rax)
-
-	movl	%esi,96(%rax)
-	xorl	%eax,%eax
-	jmp	L$enc_key_ret
-
-.p2align	4
 L$12rounds:
 	movq	16(%rdi),%xmm2
 	movl	$11,%esi
-	cmpl	$268435456,%r10d
-	je	L$12rounds_alt
 
 	movups	%xmm0,(%rdx)
 .byte	102,15,58,223,202,1
@@ -2101,53 +2026,10 @@
 	jmp	L$enc_key_ret
 
 .p2align	4
-L$12rounds_alt:
-	movdqa	L$key_rotate192(%rip),%xmm5
-	movdqa	L$key_rcon1(%rip),%xmm4
-	movl	$8,%r10d
-	movdqu	%xmm0,(%rdx)
-	jmp	L$oop_key192
-
-.p2align	4
-L$oop_key192:
-	movq	%xmm2,0(%rax)
-	movdqa	%xmm2,%xmm1
-.byte	102,15,56,0,213
-.byte	102,15,56,221,212
-	pslld	$1,%xmm4
-	leaq	24(%rax),%rax
-
-	movdqa	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm3,%xmm0
-
-	pshufd	$0xff,%xmm0,%xmm3
-	pxor	%xmm1,%xmm3
-	pslldq	$4,%xmm1
-	pxor	%xmm1,%xmm3
-
-	pxor	%xmm2,%xmm0
-	pxor	%xmm3,%xmm2
-	movdqu	%xmm0,-16(%rax)
-
-	decl	%r10d
-	jnz	L$oop_key192
-
-	movl	%esi,32(%rax)
-	xorl	%eax,%eax
-	jmp	L$enc_key_ret
-
-.p2align	4
 L$14rounds:
 	movups	16(%rdi),%xmm2
 	movl	$13,%esi
 	leaq	16(%rax),%rax
-	cmpl	$268435456,%r10d
-	je	L$14rounds_alt
 
 	movups	%xmm0,(%rdx)
 	movups	%xmm2,16(%rdx)
@@ -2183,60 +2065,6 @@
 	jmp	L$enc_key_ret
 
 .p2align	4
-L$14rounds_alt:
-	movdqa	L$key_rotate(%rip),%xmm5
-	movdqa	L$key_rcon1(%rip),%xmm4
-	movl	$7,%r10d
-	movdqu	%xmm0,0(%rdx)
-	movdqa	%xmm2,%xmm1
-	movdqu	%xmm2,16(%rdx)
-	jmp	L$oop_key256
-
-.p2align	4
-L$oop_key256:
-.byte	102,15,56,0,213
-.byte	102,15,56,221,212
-
-	movdqa	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm3,%xmm0
-	pslld	$1,%xmm4
-
-	pxor	%xmm2,%xmm0
-	movdqu	%xmm0,(%rax)
-
-	decl	%r10d
-	jz	L$done_key256
-
-	pshufd	$0xff,%xmm0,%xmm2
-	pxor	%xmm3,%xmm3
-.byte	102,15,56,221,211
-
-	movdqa	%xmm1,%xmm3
-	pslldq	$4,%xmm1
-	pxor	%xmm1,%xmm3
-	pslldq	$4,%xmm1
-	pxor	%xmm1,%xmm3
-	pslldq	$4,%xmm1
-	pxor	%xmm3,%xmm1
-
-	pxor	%xmm1,%xmm2
-	movdqu	%xmm2,16(%rax)
-	leaq	32(%rax),%rax
-	movdqa	%xmm2,%xmm1
-
-	jmp	L$oop_key256
-
-L$done_key256:
-	movl	%esi,16(%rax)
-	xorl	%eax,%eax
-	jmp	L$enc_key_ret
-
-.p2align	4
 L$bad_keybits:
 	movq	$-2,%rax
 L$enc_key_ret:
@@ -2321,6 +2149,214 @@
 	xorps	%xmm1,%xmm2
 	ret
 
+
+.globl	_aes_hw_set_encrypt_key_alt
+.private_extern _aes_hw_set_encrypt_key_alt
+
+.p2align	4
+_aes_hw_set_encrypt_key_alt:
+
+
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+	movb	$1,_BORINGSSL_function_hit+3(%rip)
+#endif
+	subq	$8,%rsp
+
+
+
+	movups	(%rdi),%xmm0
+	xorps	%xmm4,%xmm4
+	leaq	16(%rdx),%rax
+	cmpl	$256,%esi
+	je	L$14rounds_alt
+	cmpl	$192,%esi
+	je	L$12rounds_alt
+	cmpl	$128,%esi
+	jne	L$bad_keybits_alt
+
+	movl	$9,%esi
+	movdqa	L$key_rotate(%rip),%xmm5
+	movl	$8,%r10d
+	movdqa	L$key_rcon1(%rip),%xmm4
+	movdqa	%xmm0,%xmm2
+	movdqu	%xmm0,(%rdx)
+	jmp	L$oop_key128
+
+.p2align	4
+L$oop_key128:
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	pslld	$1,%xmm4
+	leaq	16(%rax),%rax
+
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,-16(%rax)
+	movdqa	%xmm0,%xmm2
+
+	decl	%r10d
+	jnz	L$oop_key128
+
+	movdqa	L$key_rcon1b(%rip),%xmm4
+
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	pslld	$1,%xmm4
+
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,(%rax)
+
+	movdqa	%xmm0,%xmm2
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,16(%rax)
+
+	movl	%esi,96(%rax)
+	xorl	%eax,%eax
+	jmp	L$enc_key_ret_alt
+
+.p2align	4
+L$12rounds_alt:
+	movq	16(%rdi),%xmm2
+	movl	$11,%esi
+	movdqa	L$key_rotate192(%rip),%xmm5
+	movdqa	L$key_rcon1(%rip),%xmm4
+	movl	$8,%r10d
+	movdqu	%xmm0,(%rdx)
+	jmp	L$oop_key192
+
+.p2align	4
+L$oop_key192:
+	movq	%xmm2,0(%rax)
+	movdqa	%xmm2,%xmm1
+.byte	102,15,56,0,213
+.byte	102,15,56,221,212
+	pslld	$1,%xmm4
+	leaq	24(%rax),%rax
+
+	movdqa	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm3,%xmm0
+
+	pshufd	$0xff,%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+
+	pxor	%xmm2,%xmm0
+	pxor	%xmm3,%xmm2
+	movdqu	%xmm0,-16(%rax)
+
+	decl	%r10d
+	jnz	L$oop_key192
+
+	movl	%esi,32(%rax)
+	xorl	%eax,%eax
+	jmp	L$enc_key_ret_alt
+
+.p2align	4
+L$14rounds_alt:
+	movups	16(%rdi),%xmm2
+	movl	$13,%esi
+	leaq	16(%rax),%rax
+	movdqa	L$key_rotate(%rip),%xmm5
+	movdqa	L$key_rcon1(%rip),%xmm4
+	movl	$7,%r10d
+	movdqu	%xmm0,0(%rdx)
+	movdqa	%xmm2,%xmm1
+	movdqu	%xmm2,16(%rdx)
+	jmp	L$oop_key256
+
+.p2align	4
+L$oop_key256:
+.byte	102,15,56,0,213
+.byte	102,15,56,221,212
+
+	movdqa	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm3,%xmm0
+	pslld	$1,%xmm4
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,(%rax)
+
+	decl	%r10d
+	jz	L$done_key256
+
+	pshufd	$0xff,%xmm0,%xmm2
+	pxor	%xmm3,%xmm3
+.byte	102,15,56,221,211
+
+	movdqa	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm3,%xmm1
+
+	pxor	%xmm1,%xmm2
+	movdqu	%xmm2,16(%rax)
+	leaq	32(%rax),%rax
+	movdqa	%xmm2,%xmm1
+
+	jmp	L$oop_key256
+
+L$done_key256:
+	movl	%esi,16(%rax)
+	xorl	%eax,%eax
+	jmp	L$enc_key_ret_alt
+
+.p2align	4
+L$bad_keybits_alt:
+	movq	$-2,%rax
+L$enc_key_ret_alt:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	addq	$8,%rsp
+
+	ret
+
+
+
 .section	__DATA,__const
 .p2align	6
 L$bswap_mask:
diff --git a/gen/bcm/aesni-x86_64-linux.S b/gen/bcm/aesni-x86_64-linux.S
index 38ed6e7..cd695b4 100644
--- a/gen/bcm/aesni-x86_64-linux.S
+++ b/gen/bcm/aesni-x86_64-linux.S
@@ -5,8 +5,6 @@
 
 #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
 .text	
-.extern	OPENSSL_ia32cap_P
-.hidden OPENSSL_ia32cap_P
 .globl	aes_hw_encrypt
 .hidden aes_hw_encrypt
 .type	aes_hw_encrypt,@function
@@ -1947,11 +1945,11 @@
 	ret
 .cfi_endproc	
 .size	aes_hw_encrypt_key_to_decrypt_key,.-aes_hw_encrypt_key_to_decrypt_key
-.globl	aes_hw_set_encrypt_key
-.hidden aes_hw_set_encrypt_key
-.type	aes_hw_set_encrypt_key,@function
+.globl	aes_hw_set_encrypt_key_base
+.hidden aes_hw_set_encrypt_key_base
+.type	aes_hw_set_encrypt_key_base,@function
 .align	16
-aes_hw_set_encrypt_key:
+aes_hw_set_encrypt_key_base:
 .cfi_startproc	
 
 _CET_ENDBR
@@ -1964,9 +1962,6 @@
 
 	movups	(%rdi),%xmm0
 	xorps	%xmm4,%xmm4
-	leaq	OPENSSL_ia32cap_P(%rip),%r10
-	movl	4(%r10),%r10d
-	andl	$268437504,%r10d
 	leaq	16(%rdx),%rax
 	cmpl	$256,%esi
 	je	.L14rounds
@@ -1977,8 +1972,6 @@
 
 .L10rounds:
 	movl	$9,%esi
-	cmpl	$268435456,%r10d
-	je	.L10rounds_alt
 
 	movups	%xmm0,(%rdx)
 .byte	102,15,58,223,200,1
@@ -2007,78 +2000,9 @@
 	jmp	.Lenc_key_ret
 
 .align	16
-.L10rounds_alt:
-	movdqa	.Lkey_rotate(%rip),%xmm5
-	movl	$8,%r10d
-	movdqa	.Lkey_rcon1(%rip),%xmm4
-	movdqa	%xmm0,%xmm2
-	movdqu	%xmm0,(%rdx)
-	jmp	.Loop_key128
-
-.align	16
-.Loop_key128:
-.byte	102,15,56,0,197
-.byte	102,15,56,221,196
-	pslld	$1,%xmm4
-	leaq	16(%rax),%rax
-
-	movdqa	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm3,%xmm2
-
-	pxor	%xmm2,%xmm0
-	movdqu	%xmm0,-16(%rax)
-	movdqa	%xmm0,%xmm2
-
-	decl	%r10d
-	jnz	.Loop_key128
-
-	movdqa	.Lkey_rcon1b(%rip),%xmm4
-
-.byte	102,15,56,0,197
-.byte	102,15,56,221,196
-	pslld	$1,%xmm4
-
-	movdqa	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm3,%xmm2
-
-	pxor	%xmm2,%xmm0
-	movdqu	%xmm0,(%rax)
-
-	movdqa	%xmm0,%xmm2
-.byte	102,15,56,0,197
-.byte	102,15,56,221,196
-
-	movdqa	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm3,%xmm2
-
-	pxor	%xmm2,%xmm0
-	movdqu	%xmm0,16(%rax)
-
-	movl	%esi,96(%rax)
-	xorl	%eax,%eax
-	jmp	.Lenc_key_ret
-
-.align	16
 .L12rounds:
 	movq	16(%rdi),%xmm2
 	movl	$11,%esi
-	cmpl	$268435456,%r10d
-	je	.L12rounds_alt
 
 	movups	%xmm0,(%rdx)
 .byte	102,15,58,223,202,1
@@ -2103,53 +2027,10 @@
 	jmp	.Lenc_key_ret
 
 .align	16
-.L12rounds_alt:
-	movdqa	.Lkey_rotate192(%rip),%xmm5
-	movdqa	.Lkey_rcon1(%rip),%xmm4
-	movl	$8,%r10d
-	movdqu	%xmm0,(%rdx)
-	jmp	.Loop_key192
-
-.align	16
-.Loop_key192:
-	movq	%xmm2,0(%rax)
-	movdqa	%xmm2,%xmm1
-.byte	102,15,56,0,213
-.byte	102,15,56,221,212
-	pslld	$1,%xmm4
-	leaq	24(%rax),%rax
-
-	movdqa	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm3,%xmm0
-
-	pshufd	$0xff,%xmm0,%xmm3
-	pxor	%xmm1,%xmm3
-	pslldq	$4,%xmm1
-	pxor	%xmm1,%xmm3
-
-	pxor	%xmm2,%xmm0
-	pxor	%xmm3,%xmm2
-	movdqu	%xmm0,-16(%rax)
-
-	decl	%r10d
-	jnz	.Loop_key192
-
-	movl	%esi,32(%rax)
-	xorl	%eax,%eax
-	jmp	.Lenc_key_ret
-
-.align	16
 .L14rounds:
 	movups	16(%rdi),%xmm2
 	movl	$13,%esi
 	leaq	16(%rax),%rax
-	cmpl	$268435456,%r10d
-	je	.L14rounds_alt
 
 	movups	%xmm0,(%rdx)
 	movups	%xmm2,16(%rdx)
@@ -2185,60 +2066,6 @@
 	jmp	.Lenc_key_ret
 
 .align	16
-.L14rounds_alt:
-	movdqa	.Lkey_rotate(%rip),%xmm5
-	movdqa	.Lkey_rcon1(%rip),%xmm4
-	movl	$7,%r10d
-	movdqu	%xmm0,0(%rdx)
-	movdqa	%xmm2,%xmm1
-	movdqu	%xmm2,16(%rdx)
-	jmp	.Loop_key256
-
-.align	16
-.Loop_key256:
-.byte	102,15,56,0,213
-.byte	102,15,56,221,212
-
-	movdqa	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm3,%xmm0
-	pslld	$1,%xmm4
-
-	pxor	%xmm2,%xmm0
-	movdqu	%xmm0,(%rax)
-
-	decl	%r10d
-	jz	.Ldone_key256
-
-	pshufd	$0xff,%xmm0,%xmm2
-	pxor	%xmm3,%xmm3
-.byte	102,15,56,221,211
-
-	movdqa	%xmm1,%xmm3
-	pslldq	$4,%xmm1
-	pxor	%xmm1,%xmm3
-	pslldq	$4,%xmm1
-	pxor	%xmm1,%xmm3
-	pslldq	$4,%xmm1
-	pxor	%xmm3,%xmm1
-
-	pxor	%xmm1,%xmm2
-	movdqu	%xmm2,16(%rax)
-	leaq	32(%rax),%rax
-	movdqa	%xmm2,%xmm1
-
-	jmp	.Loop_key256
-
-.Ldone_key256:
-	movl	%esi,16(%rax)
-	xorl	%eax,%eax
-	jmp	.Lenc_key_ret
-
-.align	16
 .Lbad_keybits:
 	movq	$-2,%rax
 .Lenc_key_ret:
@@ -2322,7 +2149,215 @@
 	shufps	$170,%xmm1,%xmm1
 	xorps	%xmm1,%xmm2
 	ret
-.size	aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
+.size	aes_hw_set_encrypt_key_base,.-aes_hw_set_encrypt_key_base
+
+.globl	aes_hw_set_encrypt_key_alt
+.hidden aes_hw_set_encrypt_key_alt
+.type	aes_hw_set_encrypt_key_alt,@function
+.align	16
+aes_hw_set_encrypt_key_alt:
+.cfi_startproc	
+
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+	movb	$1,BORINGSSL_function_hit+3(%rip)
+#endif
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	movups	(%rdi),%xmm0
+	xorps	%xmm4,%xmm4
+	leaq	16(%rdx),%rax
+	cmpl	$256,%esi
+	je	.L14rounds_alt
+	cmpl	$192,%esi
+	je	.L12rounds_alt
+	cmpl	$128,%esi
+	jne	.Lbad_keybits_alt
+
+	movl	$9,%esi
+	movdqa	.Lkey_rotate(%rip),%xmm5
+	movl	$8,%r10d
+	movdqa	.Lkey_rcon1(%rip),%xmm4
+	movdqa	%xmm0,%xmm2
+	movdqu	%xmm0,(%rdx)
+	jmp	.Loop_key128
+
+.align	16
+.Loop_key128:
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	pslld	$1,%xmm4
+	leaq	16(%rax),%rax
+
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,-16(%rax)
+	movdqa	%xmm0,%xmm2
+
+	decl	%r10d
+	jnz	.Loop_key128
+
+	movdqa	.Lkey_rcon1b(%rip),%xmm4
+
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	pslld	$1,%xmm4
+
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,(%rax)
+
+	movdqa	%xmm0,%xmm2
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,16(%rax)
+
+	movl	%esi,96(%rax)
+	xorl	%eax,%eax
+	jmp	.Lenc_key_ret_alt
+
+.align	16
+.L12rounds_alt:
+	movq	16(%rdi),%xmm2
+	movl	$11,%esi
+	movdqa	.Lkey_rotate192(%rip),%xmm5
+	movdqa	.Lkey_rcon1(%rip),%xmm4
+	movl	$8,%r10d
+	movdqu	%xmm0,(%rdx)
+	jmp	.Loop_key192
+
+.align	16
+.Loop_key192:
+	movq	%xmm2,0(%rax)
+	movdqa	%xmm2,%xmm1
+.byte	102,15,56,0,213
+.byte	102,15,56,221,212
+	pslld	$1,%xmm4
+	leaq	24(%rax),%rax
+
+	movdqa	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm3,%xmm0
+
+	pshufd	$0xff,%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+
+	pxor	%xmm2,%xmm0
+	pxor	%xmm3,%xmm2
+	movdqu	%xmm0,-16(%rax)
+
+	decl	%r10d
+	jnz	.Loop_key192
+
+	movl	%esi,32(%rax)
+	xorl	%eax,%eax
+	jmp	.Lenc_key_ret_alt
+
+.align	16
+.L14rounds_alt:
+	movups	16(%rdi),%xmm2
+	movl	$13,%esi
+	leaq	16(%rax),%rax
+	movdqa	.Lkey_rotate(%rip),%xmm5
+	movdqa	.Lkey_rcon1(%rip),%xmm4
+	movl	$7,%r10d
+	movdqu	%xmm0,0(%rdx)
+	movdqa	%xmm2,%xmm1
+	movdqu	%xmm2,16(%rdx)
+	jmp	.Loop_key256
+
+.align	16
+.Loop_key256:
+.byte	102,15,56,0,213
+.byte	102,15,56,221,212
+
+	movdqa	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm3,%xmm0
+	pslld	$1,%xmm4
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,(%rax)
+
+	decl	%r10d
+	jz	.Ldone_key256
+
+	pshufd	$0xff,%xmm0,%xmm2
+	pxor	%xmm3,%xmm3
+.byte	102,15,56,221,211
+
+	movdqa	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm3,%xmm1
+
+	pxor	%xmm1,%xmm2
+	movdqu	%xmm2,16(%rax)
+	leaq	32(%rax),%rax
+	movdqa	%xmm2,%xmm1
+
+	jmp	.Loop_key256
+
+.Ldone_key256:
+	movl	%esi,16(%rax)
+	xorl	%eax,%eax
+	jmp	.Lenc_key_ret_alt
+
+.align	16
+.Lbad_keybits_alt:
+	movq	$-2,%rax
+.Lenc_key_ret_alt:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	addq	$8,%rsp
+.cfi_adjust_cfa_offset	-8
+	ret
+.cfi_endproc	
+
+.size	aes_hw_set_encrypt_key_alt,.-aes_hw_set_encrypt_key_alt
 .section	.rodata
 .align	64
 .Lbswap_mask:
diff --git a/gen/bcm/aesni-x86_64-win.asm b/gen/bcm/aesni-x86_64-win.asm
index 64dd6be..4876ed7 100644
--- a/gen/bcm/aesni-x86_64-win.asm
+++ b/gen/bcm/aesni-x86_64-win.asm
@@ -13,7 +13,6 @@
 %endif
 section	.text code align=64
 
-EXTERN	OPENSSL_ia32cap_P
 global	aes_hw_encrypt
 
 ALIGN	16
@@ -2051,25 +2050,22 @@
 	ret
 
 
-global	aes_hw_set_encrypt_key
+global	aes_hw_set_encrypt_key_base
 
 ALIGN	16
-aes_hw_set_encrypt_key:
+aes_hw_set_encrypt_key_base:
 
-$L$SEH_begin_aes_hw_set_encrypt_key_1:
+$L$SEH_begin_aes_hw_set_encrypt_key_base_1:
 _CET_ENDBR
 %ifdef BORINGSSL_DISPATCH_TEST
 	mov	BYTE[((BORINGSSL_function_hit+3))],1
 %endif
 	sub	rsp,8
 
-$L$SEH_prologue_aes_hw_set_encrypt_key_2:
-$L$SEH_endprologue_aes_hw_set_encrypt_key_3:
+$L$SEH_prologue_aes_hw_set_encrypt_key_base_2:
+$L$SEH_endprologue_aes_hw_set_encrypt_key_base_3:
 	movups	xmm0,XMMWORD[rcx]
 	xorps	xmm4,xmm4
-	lea	r10,[OPENSSL_ia32cap_P]
-	mov	r10d,DWORD[4+r10]
-	and	r10d,268437504
 	lea	rax,[16+r8]
 	cmp	edx,256
 	je	NEAR $L$14rounds
@@ -2080,8 +2076,6 @@
 
 $L$10rounds:
 	mov	edx,9
-	cmp	r10d,268435456
-	je	NEAR $L$10rounds_alt
 
 	movups	XMMWORD[r8],xmm0
 	DB	102,15,58,223,200,1
@@ -2110,78 +2104,9 @@
 	jmp	NEAR $L$enc_key_ret
 
 ALIGN	16
-$L$10rounds_alt:
-	movdqa	xmm5,XMMWORD[$L$key_rotate]
-	mov	r10d,8
-	movdqa	xmm4,XMMWORD[$L$key_rcon1]
-	movdqa	xmm2,xmm0
-	movdqu	XMMWORD[r8],xmm0
-	jmp	NEAR $L$oop_key128
-
-ALIGN	16
-$L$oop_key128:
-DB	102,15,56,0,197
-	DB	102,15,56,221,196
-	pslld	xmm4,1
-	lea	rax,[16+rax]
-
-	movdqa	xmm3,xmm2
-	pslldq	xmm2,4
-	pxor	xmm3,xmm2
-	pslldq	xmm2,4
-	pxor	xmm3,xmm2
-	pslldq	xmm2,4
-	pxor	xmm2,xmm3
-
-	pxor	xmm0,xmm2
-	movdqu	XMMWORD[(-16)+rax],xmm0
-	movdqa	xmm2,xmm0
-
-	dec	r10d
-	jnz	NEAR $L$oop_key128
-
-	movdqa	xmm4,XMMWORD[$L$key_rcon1b]
-
-DB	102,15,56,0,197
-	DB	102,15,56,221,196
-	pslld	xmm4,1
-
-	movdqa	xmm3,xmm2
-	pslldq	xmm2,4
-	pxor	xmm3,xmm2
-	pslldq	xmm2,4
-	pxor	xmm3,xmm2
-	pslldq	xmm2,4
-	pxor	xmm2,xmm3
-
-	pxor	xmm0,xmm2
-	movdqu	XMMWORD[rax],xmm0
-
-	movdqa	xmm2,xmm0
-DB	102,15,56,0,197
-	DB	102,15,56,221,196
-
-	movdqa	xmm3,xmm2
-	pslldq	xmm2,4
-	pxor	xmm3,xmm2
-	pslldq	xmm2,4
-	pxor	xmm3,xmm2
-	pslldq	xmm2,4
-	pxor	xmm2,xmm3
-
-	pxor	xmm0,xmm2
-	movdqu	XMMWORD[16+rax],xmm0
-
-	mov	DWORD[96+rax],edx
-	xor	eax,eax
-	jmp	NEAR $L$enc_key_ret
-
-ALIGN	16
 $L$12rounds:
 	movq	xmm2,QWORD[16+rcx]
 	mov	edx,11
-	cmp	r10d,268435456
-	je	NEAR $L$12rounds_alt
 
 	movups	XMMWORD[r8],xmm0
 	DB	102,15,58,223,202,1
@@ -2206,53 +2131,10 @@
 	jmp	NEAR $L$enc_key_ret
 
 ALIGN	16
-$L$12rounds_alt:
-	movdqa	xmm5,XMMWORD[$L$key_rotate192]
-	movdqa	xmm4,XMMWORD[$L$key_rcon1]
-	mov	r10d,8
-	movdqu	XMMWORD[r8],xmm0
-	jmp	NEAR $L$oop_key192
-
-ALIGN	16
-$L$oop_key192:
-	movq	QWORD[rax],xmm2
-	movdqa	xmm1,xmm2
-DB	102,15,56,0,213
-	DB	102,15,56,221,212
-	pslld	xmm4,1
-	lea	rax,[24+rax]
-
-	movdqa	xmm3,xmm0
-	pslldq	xmm0,4
-	pxor	xmm3,xmm0
-	pslldq	xmm0,4
-	pxor	xmm3,xmm0
-	pslldq	xmm0,4
-	pxor	xmm0,xmm3
-
-	pshufd	xmm3,xmm0,0xff
-	pxor	xmm3,xmm1
-	pslldq	xmm1,4
-	pxor	xmm3,xmm1
-
-	pxor	xmm0,xmm2
-	pxor	xmm2,xmm3
-	movdqu	XMMWORD[(-16)+rax],xmm0
-
-	dec	r10d
-	jnz	NEAR $L$oop_key192
-
-	mov	DWORD[32+rax],edx
-	xor	eax,eax
-	jmp	NEAR $L$enc_key_ret
-
-ALIGN	16
 $L$14rounds:
 	movups	xmm2,XMMWORD[16+rcx]
 	mov	edx,13
 	lea	rax,[16+rax]
-	cmp	r10d,268435456
-	je	NEAR $L$14rounds_alt
 
 	movups	XMMWORD[r8],xmm0
 	movups	XMMWORD[16+r8],xmm2
@@ -2288,60 +2170,6 @@
 	jmp	NEAR $L$enc_key_ret
 
 ALIGN	16
-$L$14rounds_alt:
-	movdqa	xmm5,XMMWORD[$L$key_rotate]
-	movdqa	xmm4,XMMWORD[$L$key_rcon1]
-	mov	r10d,7
-	movdqu	XMMWORD[r8],xmm0
-	movdqa	xmm1,xmm2
-	movdqu	XMMWORD[16+r8],xmm2
-	jmp	NEAR $L$oop_key256
-
-ALIGN	16
-$L$oop_key256:
-DB	102,15,56,0,213
-	DB	102,15,56,221,212
-
-	movdqa	xmm3,xmm0
-	pslldq	xmm0,4
-	pxor	xmm3,xmm0
-	pslldq	xmm0,4
-	pxor	xmm3,xmm0
-	pslldq	xmm0,4
-	pxor	xmm0,xmm3
-	pslld	xmm4,1
-
-	pxor	xmm0,xmm2
-	movdqu	XMMWORD[rax],xmm0
-
-	dec	r10d
-	jz	NEAR $L$done_key256
-
-	pshufd	xmm2,xmm0,0xff
-	pxor	xmm3,xmm3
-	DB	102,15,56,221,211
-
-	movdqa	xmm3,xmm1
-	pslldq	xmm1,4
-	pxor	xmm3,xmm1
-	pslldq	xmm1,4
-	pxor	xmm3,xmm1
-	pslldq	xmm1,4
-	pxor	xmm1,xmm3
-
-	pxor	xmm2,xmm1
-	movdqu	XMMWORD[16+rax],xmm2
-	lea	rax,[32+rax]
-	movdqa	xmm1,xmm2
-
-	jmp	NEAR $L$oop_key256
-
-$L$done_key256:
-	mov	DWORD[16+rax],edx
-	xor	eax,eax
-	jmp	NEAR $L$enc_key_ret
-
-ALIGN	16
 $L$bad_keybits:
 	mov	rax,-2
 $L$enc_key_ret:
@@ -2355,7 +2183,7 @@
 
 	ret
 
-$L$SEH_end_aes_hw_set_encrypt_key_4:
+$L$SEH_end_aes_hw_set_encrypt_key_base_4:
 
 ALIGN	16
 $L$key_expansion_128:
@@ -2426,6 +2254,213 @@
 	xorps	xmm2,xmm1
 	ret
 
+
+global	aes_hw_set_encrypt_key_alt
+
+ALIGN	16
+aes_hw_set_encrypt_key_alt:
+
+$L$SEH_begin_aes_hw_set_encrypt_key_alt_1:
+_CET_ENDBR
+%ifdef BORINGSSL_DISPATCH_TEST
+	mov	BYTE[((BORINGSSL_function_hit+3))],1
+%endif
+	sub	rsp,8
+
+$L$SEH_prologue_aes_hw_set_encrypt_key_alt_2:
+$L$SEH_endprologue_aes_hw_set_encrypt_key_alt_3:
+	movups	xmm0,XMMWORD[rcx]
+	xorps	xmm4,xmm4
+	lea	rax,[16+r8]
+	cmp	edx,256
+	je	NEAR $L$14rounds_alt
+	cmp	edx,192
+	je	NEAR $L$12rounds_alt
+	cmp	edx,128
+	jne	NEAR $L$bad_keybits_alt
+
+	mov	edx,9
+	movdqa	xmm5,XMMWORD[$L$key_rotate]
+	mov	r10d,8
+	movdqa	xmm4,XMMWORD[$L$key_rcon1]
+	movdqa	xmm2,xmm0
+	movdqu	XMMWORD[r8],xmm0
+	jmp	NEAR $L$oop_key128
+
+ALIGN	16
+$L$oop_key128:
+DB	102,15,56,0,197
+	DB	102,15,56,221,196
+	pslld	xmm4,1
+	lea	rax,[16+rax]
+
+	movdqa	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm2,xmm3
+
+	pxor	xmm0,xmm2
+	movdqu	XMMWORD[(-16)+rax],xmm0
+	movdqa	xmm2,xmm0
+
+	dec	r10d
+	jnz	NEAR $L$oop_key128
+
+	movdqa	xmm4,XMMWORD[$L$key_rcon1b]
+
+DB	102,15,56,0,197
+	DB	102,15,56,221,196
+	pslld	xmm4,1
+
+	movdqa	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm2,xmm3
+
+	pxor	xmm0,xmm2
+	movdqu	XMMWORD[rax],xmm0
+
+	movdqa	xmm2,xmm0
+DB	102,15,56,0,197
+	DB	102,15,56,221,196
+
+	movdqa	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm2,xmm3
+
+	pxor	xmm0,xmm2
+	movdqu	XMMWORD[16+rax],xmm0
+
+	mov	DWORD[96+rax],edx
+	xor	eax,eax
+	jmp	NEAR $L$enc_key_ret_alt
+
+ALIGN	16
+$L$12rounds_alt:
+	movq	xmm2,QWORD[16+rcx]
+	mov	edx,11
+	movdqa	xmm5,XMMWORD[$L$key_rotate192]
+	movdqa	xmm4,XMMWORD[$L$key_rcon1]
+	mov	r10d,8
+	movdqu	XMMWORD[r8],xmm0
+	jmp	NEAR $L$oop_key192
+
+ALIGN	16
+$L$oop_key192:
+	movq	QWORD[rax],xmm2
+	movdqa	xmm1,xmm2
+DB	102,15,56,0,213
+	DB	102,15,56,221,212
+	pslld	xmm4,1
+	lea	rax,[24+rax]
+
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm0,xmm3
+
+	pshufd	xmm3,xmm0,0xff
+	pxor	xmm3,xmm1
+	pslldq	xmm1,4
+	pxor	xmm3,xmm1
+
+	pxor	xmm0,xmm2
+	pxor	xmm2,xmm3
+	movdqu	XMMWORD[(-16)+rax],xmm0
+
+	dec	r10d
+	jnz	NEAR $L$oop_key192
+
+	mov	DWORD[32+rax],edx
+	xor	eax,eax
+	jmp	NEAR $L$enc_key_ret_alt
+
+ALIGN	16
+$L$14rounds_alt:
+	movups	xmm2,XMMWORD[16+rcx]
+	mov	edx,13
+	lea	rax,[16+rax]
+	movdqa	xmm5,XMMWORD[$L$key_rotate]
+	movdqa	xmm4,XMMWORD[$L$key_rcon1]
+	mov	r10d,7
+	movdqu	XMMWORD[r8],xmm0
+	movdqa	xmm1,xmm2
+	movdqu	XMMWORD[16+r8],xmm2
+	jmp	NEAR $L$oop_key256
+
+ALIGN	16
+$L$oop_key256:
+DB	102,15,56,0,213
+	DB	102,15,56,221,212
+
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm0,xmm3
+	pslld	xmm4,1
+
+	pxor	xmm0,xmm2
+	movdqu	XMMWORD[rax],xmm0
+
+	dec	r10d
+	jz	NEAR $L$done_key256
+
+	pshufd	xmm2,xmm0,0xff
+	pxor	xmm3,xmm3
+	DB	102,15,56,221,211
+
+	movdqa	xmm3,xmm1
+	pslldq	xmm1,4
+	pxor	xmm3,xmm1
+	pslldq	xmm1,4
+	pxor	xmm3,xmm1
+	pslldq	xmm1,4
+	pxor	xmm1,xmm3
+
+	pxor	xmm2,xmm1
+	movdqu	XMMWORD[16+rax],xmm2
+	lea	rax,[32+rax]
+	movdqa	xmm1,xmm2
+
+	jmp	NEAR $L$oop_key256
+
+$L$done_key256:
+	mov	DWORD[16+rax],edx
+	xor	eax,eax
+	jmp	NEAR $L$enc_key_ret_alt
+
+ALIGN	16
+$L$bad_keybits_alt:
+	mov	rax,-2
+$L$enc_key_ret_alt:
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	add	rsp,8
+
+	ret
+
+$L$SEH_end_aes_hw_set_encrypt_key_alt_4:
+
 section	.rdata rdata align=8
 ALIGN	64
 $L$bswap_mask:
@@ -2650,19 +2685,32 @@
 	DD	cbc_se_handler wrt ..imagebase
 section	.pdata
 ALIGN	4
-	DD	$L$SEH_begin_aes_hw_set_encrypt_key_1 wrt ..imagebase
-	DD	$L$SEH_end_aes_hw_set_encrypt_key_4 wrt ..imagebase
-	DD	$L$SEH_info_aes_hw_set_encrypt_key_0 wrt ..imagebase
+	DD	$L$SEH_begin_aes_hw_set_encrypt_key_base_1 wrt ..imagebase
+	DD	$L$SEH_end_aes_hw_set_encrypt_key_base_4 wrt ..imagebase
+	DD	$L$SEH_info_aes_hw_set_encrypt_key_base_0 wrt ..imagebase
+
+	DD	$L$SEH_begin_aes_hw_set_encrypt_key_alt_1 wrt ..imagebase
+	DD	$L$SEH_end_aes_hw_set_encrypt_key_alt_4 wrt ..imagebase
+	DD	$L$SEH_info_aes_hw_set_encrypt_key_alt_0 wrt ..imagebase
 
 
 section	.xdata
 ALIGN	4
-$L$SEH_info_aes_hw_set_encrypt_key_0:
+$L$SEH_info_aes_hw_set_encrypt_key_base_0:
 	DB	1
-	DB	$L$SEH_endprologue_aes_hw_set_encrypt_key_3-$L$SEH_begin_aes_hw_set_encrypt_key_1
+	DB	$L$SEH_endprologue_aes_hw_set_encrypt_key_base_3-$L$SEH_begin_aes_hw_set_encrypt_key_base_1
 	DB	1
 	DB	0
-	DB	$L$SEH_prologue_aes_hw_set_encrypt_key_2-$L$SEH_begin_aes_hw_set_encrypt_key_1
+	DB	$L$SEH_prologue_aes_hw_set_encrypt_key_base_2-$L$SEH_begin_aes_hw_set_encrypt_key_base_1
+	DB	2
+
+	DW	0
+$L$SEH_info_aes_hw_set_encrypt_key_alt_0:
+	DB	1
+	DB	$L$SEH_endprologue_aes_hw_set_encrypt_key_alt_3-$L$SEH_begin_aes_hw_set_encrypt_key_alt_1
+	DB	1
+	DB	0
+	DB	$L$SEH_prologue_aes_hw_set_encrypt_key_alt_2-$L$SEH_begin_aes_hw_set_encrypt_key_alt_1
 	DB	2
 
 	DW	0
diff --git a/gen/bcm/vpaes-x86-apple.S b/gen/bcm/vpaes-x86-apple.S
index 4d2c485..02d3787 100644
--- a/gen/bcm/vpaes-x86-apple.S
+++ b/gen/bcm/vpaes-x86-apple.S
@@ -470,10 +470,10 @@
 #ifdef BORINGSSL_DISPATCH_TEST
 	pushl	%ebx
 	pushl	%edx
-	call	L016pic
-L016pic:
+	call	L016pic_for_function_hit
+L016pic_for_function_hit:
 	popl	%ebx
-	leal	_BORINGSSL_function_hit+5-L016pic(%ebx),%ebx
+	leal	_BORINGSSL_function_hit+5-L016pic_for_function_hit(%ebx),%ebx
 	movl	$1,%edx
 	movb	%dl,(%ebx)
 	popl	%edx
@@ -551,10 +551,10 @@
 #ifdef BORINGSSL_DISPATCH_TEST
 	pushl	%ebx
 	pushl	%edx
-	call	L019pic
-L019pic:
+	call	L019pic_for_function_hit
+L019pic_for_function_hit:
 	popl	%ebx
-	leal	_BORINGSSL_function_hit+4-L019pic(%ebx),%ebx
+	leal	_BORINGSSL_function_hit+4-L019pic_for_function_hit(%ebx),%ebx
 	movl	$1,%edx
 	movb	%dl,(%ebx)
 	popl	%edx
diff --git a/gen/bcm/vpaes-x86-linux.S b/gen/bcm/vpaes-x86-linux.S
index 02786a7..31dc9a0 100644
--- a/gen/bcm/vpaes-x86-linux.S
+++ b/gen/bcm/vpaes-x86-linux.S
@@ -487,10 +487,10 @@
 #ifdef BORINGSSL_DISPATCH_TEST
 	pushl	%ebx
 	pushl	%edx
-	call	.L016pic
-.L016pic:
+	call	.L016pic_for_function_hit
+.L016pic_for_function_hit:
 	popl	%ebx
-	leal	BORINGSSL_function_hit+5-.L016pic(%ebx),%ebx
+	leal	BORINGSSL_function_hit+5-.L016pic_for_function_hit(%ebx),%ebx
 	movl	$1,%edx
 	movb	%dl,(%ebx)
 	popl	%edx
@@ -572,10 +572,10 @@
 #ifdef BORINGSSL_DISPATCH_TEST
 	pushl	%ebx
 	pushl	%edx
-	call	.L019pic
-.L019pic:
+	call	.L019pic_for_function_hit
+.L019pic_for_function_hit:
 	popl	%ebx
-	leal	BORINGSSL_function_hit+4-.L019pic(%ebx),%ebx
+	leal	BORINGSSL_function_hit+4-.L019pic_for_function_hit(%ebx),%ebx
 	movl	$1,%edx
 	movb	%dl,(%ebx)
 	popl	%edx
diff --git a/gen/bcm/vpaes-x86-win.asm b/gen/bcm/vpaes-x86-win.asm
index 661496e..3f087e1 100644
--- a/gen/bcm/vpaes-x86-win.asm
+++ b/gen/bcm/vpaes-x86-win.asm
@@ -470,10 +470,10 @@
 %ifdef BORINGSSL_DISPATCH_TEST
 	push	ebx
 	push	edx
-	call	L$016pic
-L$016pic:
+	call	L$016pic_for_function_hit
+L$016pic_for_function_hit:
 	pop	ebx
-	lea	ebx,[(_BORINGSSL_function_hit+5-L$016pic)+ebx]
+	lea	ebx,[(_BORINGSSL_function_hit+5-L$016pic_for_function_hit)+ebx]
 	mov	edx,1
 	mov	BYTE [ebx],dl
 	pop	edx
@@ -549,10 +549,10 @@
 %ifdef BORINGSSL_DISPATCH_TEST
 	push	ebx
 	push	edx
-	call	L$019pic
-L$019pic:
+	call	L$019pic_for_function_hit
+L$019pic_for_function_hit:
 	pop	ebx
-	lea	ebx,[(_BORINGSSL_function_hit+4-L$019pic)+ebx]
+	lea	ebx,[(_BORINGSSL_function_hit+4-L$019pic_for_function_hit)+ebx]
 	mov	edx,1
 	mov	BYTE [ebx],dl
 	pop	edx