Remove OPENSSL_ia32cap_P references from AES-NI assembly
The AES-NI key schedule functions have two versions, dating to OpenSSL's
commit 23f6eec71dbd472044db7dc854599f1de14a1f48, which cites RT#3576.
Unfortunately, OpenSSL purged their old RT bugs without keeping archives,
so that context is now lost. An archive of the openssl-dev discussion
(which also predates OpenSSL's own mailing-list archives) gives most of
the context:
https://groups.google.com/g/mailing.openssl.dev/c/OuFXwW4NfO8/m/7d2ZXVjkxVkJ
Broadly, although AES-NI includes an aeskeygenassist instruction for the
key schedule, it is apparently faster overall to ignore it and use
aesenclast instead. However, the aesenclast approach is slower on older
processors, so the assembly checked for AVX && !XOP as a proxy. (Note we
always set XOP to false, even though this was likely not a true
capability check but a proxy for pre-Zen AMD chips.)
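
For reference, the removed check amounted to the following minimal C
sketch. The helper name is illustrative; the word offset and bit
positions come from the deleted "AVX and XOP bits" masks in the assembly
(the "andl $268437504" / "cmpl $268435456" sequences below):

  #include <stdint.h>

  static int old_asm_prefers_alt(const uint32_t ia32cap[4]) {
    // Word 1 of OPENSSL_ia32cap_P: bit 28 is AVX, bit 11 is XOP
    // (always cleared in BoringSSL).
    uint32_t masked = ia32cap[1] & ((1u << 28) | (1u << 11));
    return masked == (1u << 28);  // AVX && !XOP
  }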
It is unclear whether the aeskeygenassist version is still worthwhile.
However, the aesenclast version requires SSSE3. SSSE3 long predates
AES-NI, but it is not clear that AES-NI implies SSSE3. In OpenSSL, the
CCM AES-NI assembly seems to assume it does. For now, I've preserved
both versions.
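
Concretely, the dispatch now lives in C rather than in the assembly.
Sketched below, mirroring the internal.h and aes.c hunks in this change:

  // Can the alt version run at all, and do we prefer it? AVX remains
  // the proxy for processors where aesenclast is the faster choice.
  OPENSSL_INLINE int aes_hw_set_encrypt_key_alt_capable(void) {
    return hwaes_capable() && CRYPTO_is_SSSE3_capable();
  }
  OPENSSL_INLINE int aes_hw_set_encrypt_key_alt_preferred(void) {
    return hwaes_capable() && CRYPTO_is_AVX_capable();
  }
  int aes_hw_set_encrypt_key(const uint8_t *user_key, int bits,
                             AES_KEY *key) {
    if (aes_hw_set_encrypt_key_alt_preferred()) {
      return aes_hw_set_encrypt_key_alt(user_key, bits, key);
    }
    return aes_hw_set_encrypt_key_base(user_key, bits, key);
  }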
There are now only two assembly files with OPENSSL_ia32cap_P references!
Bug: 673
Change-Id: I990b1393d780db4caf074c184ce8bbd182da6e29
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/68690
Reviewed-by: Bob Beck <bbe@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/aes/aes.c b/crypto/fipsmodule/aes/aes.c
index 56dfbe2..7eab5ac 100644
--- a/crypto/fipsmodule/aes/aes.c
+++ b/crypto/fipsmodule/aes/aes.c
@@ -116,4 +116,12 @@
}
return ret;
}
+
+int aes_hw_set_encrypt_key(const uint8_t *user_key, int bits, AES_KEY *key) {
+ if (aes_hw_set_encrypt_key_alt_preferred()) {
+ return aes_hw_set_encrypt_key_alt(user_key, bits, key);
+ } else {
+ return aes_hw_set_encrypt_key_base(user_key, bits, key);
+ }
+}
#endif
diff --git a/crypto/fipsmodule/aes/aes_test.cc b/crypto/fipsmodule/aes/aes_test.cc
index dc90067..07feabb 100644
--- a/crypto/fipsmodule/aes/aes_test.cc
+++ b/crypto/fipsmodule/aes/aes_test.cc
@@ -347,7 +347,16 @@
}
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
- ASSERT_EQ(CHECK_ABI_SEH(aes_hw_set_encrypt_key, kKey, bits, &key), 0);
+ ASSERT_EQ(CHECK_ABI_SEH(aes_hw_set_encrypt_key_base, kKey, bits, &key), 0);
+ if (aes_hw_set_encrypt_key_alt_capable()) {
+ AES_KEY alt;
+ ASSERT_EQ(CHECK_ABI_SEH(aes_hw_set_encrypt_key_alt, kKey, bits, &alt),
+ 0);
+ EXPECT_EQ(alt.rounds, key.rounds);
+ for (unsigned i = 0; i <= alt.rounds; i++) {
+ EXPECT_EQ(alt.rd_key[i], key.rd_key[i]);
+ }
+ }
CHECK_ABI_SEH(aes_hw_encrypt_key_to_decrypt_key, &key);
#else
ASSERT_EQ(CHECK_ABI_SEH(aes_hw_set_decrypt_key, kKey, bits, &key), 0);
diff --git a/crypto/fipsmodule/aes/asm/aesni-x86.pl b/crypto/fipsmodule/aes/asm/aesni-x86.pl
index d8fdfb8..077be94 100644
--- a/crypto/fipsmodule/aes/asm/aesni-x86.pl
+++ b/crypto/fipsmodule/aes/asm/aesni-x86.pl
@@ -83,7 +83,6 @@
&asm_init($ARGV[0]);
-&external_label("OPENSSL_ia32cap_P");
&preprocessor_ifdef("BORINGSSL_DISPATCH_TEST")
&external_label("BORINGSSL_function_hit");
&preprocessor_endif();
@@ -2109,18 +2108,15 @@
######################################################################
# Mechanical port from aesni-x86_64.pl.
-#
-# _aesni_set_encrypt_key is private interface,
-# input:
-# "eax" const unsigned char *userKey
-# $rounds int bits
-# $key AES_KEY *key
-# output:
-# "eax" return code
-# $round rounds
-&function_begin_B("_aesni_set_encrypt_key");
- &push ("ebp");
+# int $PREFIX_set_encrypt_key_base (const unsigned char *userKey, int bits,
+# AES_KEY *key)
+&function_begin_B("${PREFIX}_set_encrypt_key_base");
+ &record_function_hit(3);
+
+ &mov ("eax",&wparam(0));
+ &mov ($rounds,&wparam(1));
+ &mov ($key,&wparam(2));
&push ("ebx");
&call (&label("pic"));
@@ -2128,12 +2124,9 @@
&blindpop("ebx");
&lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
- &picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
&movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
&xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
- &mov ("ebp",&DWP(4,"ebp"));
&lea ($key,&DWP(16,$key));
- &and ("ebp",1<<28|1<<11); # AVX and XOP bits
&cmp ($rounds,256);
&je (&label("14rounds"));
&cmp ($rounds,192);
@@ -2142,9 +2135,6 @@
&jne (&label("bad_keybits"));
&set_label("10rounds",16);
- &cmp ("ebp",1<<28);
- &je (&label("10rounds_alt"));
-
&mov ($rounds,9);
&$movekey (&QWP(-16,$key),"xmm0"); # round 0
&aeskeygenassist("xmm1","xmm0",0x01); # round 1
@@ -2184,6 +2174,166 @@
&xorps ("xmm0","xmm1");
&ret();
+&set_label("12rounds",16);
+ &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
+
+ &mov ($rounds,11);
+ &$movekey (&QWP(-16,$key),"xmm0"); # round 0
+ &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
+ &call (&label("key_192a_cold"));
+ &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
+ &call (&label("key_192b"));
+ &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
+ &call (&label("key_192a"));
+ &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
+ &call (&label("key_192b"));
+ &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
+ &call (&label("key_192a"));
+ &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
+ &call (&label("key_192b"));
+ &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
+ &call (&label("key_192a"));
+ &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
+ &call (&label("key_192b"));
+ &$movekey (&QWP(0,$key),"xmm0");
+ &mov (&DWP(48,$key),$rounds);
+
+ &jmp (&label("good_key"));
+
+&set_label("key_192a",16);
+ &$movekey (&QWP(0,$key),"xmm0");
+ &lea ($key,&DWP(16,$key));
+&set_label("key_192a_cold",16);
+ &movaps ("xmm5","xmm2");
+&set_label("key_192b_warm");
+ &shufps ("xmm4","xmm0",0b00010000);
+ &movdqa ("xmm3","xmm2");
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm4","xmm0",0b10001100);
+ &pslldq ("xmm3",4);
+ &xorps ("xmm0","xmm4");
+ &pshufd ("xmm1","xmm1",0b01010101); # critical path
+ &pxor ("xmm2","xmm3");
+ &pxor ("xmm0","xmm1");
+ &pshufd ("xmm3","xmm0",0b11111111);
+ &pxor ("xmm2","xmm3");
+ &ret();
+
+&set_label("key_192b",16);
+ &movaps ("xmm3","xmm0");
+ &shufps ("xmm5","xmm0",0b01000100);
+ &$movekey (&QWP(0,$key),"xmm5");
+ &shufps ("xmm3","xmm2",0b01001110);
+ &$movekey (&QWP(16,$key),"xmm3");
+ &lea ($key,&DWP(32,$key));
+ &jmp (&label("key_192b_warm"));
+
+&set_label("14rounds",16);
+ &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
+ &lea ($key,&DWP(16,$key));
+
+ &mov ($rounds,13);
+ &$movekey (&QWP(-32,$key),"xmm0"); # round 0
+ &$movekey (&QWP(-16,$key),"xmm2"); # round 1
+ &aeskeygenassist("xmm1","xmm2",0x01); # round 2
+ &call (&label("key_256a_cold"));
+ &aeskeygenassist("xmm1","xmm0",0x01); # round 3
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x02); # round 4
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x02); # round 5
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x04); # round 6
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x04); # round 7
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x08); # round 8
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x08); # round 9
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x10); # round 10
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x10); # round 11
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x20); # round 12
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x20); # round 13
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x40); # round 14
+ &call (&label("key_256a"));
+ &$movekey (&QWP(0,$key),"xmm0");
+ &mov (&DWP(16,$key),$rounds);
+ &xor ("eax","eax");
+
+ &jmp (&label("good_key"));
+
+&set_label("key_256a",16);
+ &$movekey (&QWP(0,$key),"xmm2");
+ &lea ($key,&DWP(16,$key));
+&set_label("key_256a_cold");
+ &shufps ("xmm4","xmm0",0b00010000);
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm4","xmm0",0b10001100);
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm1","xmm1",0b11111111); # critical path
+ &xorps ("xmm0","xmm1");
+ &ret();
+
+&set_label("key_256b",16);
+ &$movekey (&QWP(0,$key),"xmm0");
+ &lea ($key,&DWP(16,$key));
+
+ &shufps ("xmm4","xmm2",0b00010000);
+ &xorps ("xmm2","xmm4");
+ &shufps ("xmm4","xmm2",0b10001100);
+ &xorps ("xmm2","xmm4");
+ &shufps ("xmm1","xmm1",0b10101010); # critical path
+ &xorps ("xmm2","xmm1");
+ &ret();
+
+&set_label("good_key");
+ &pxor ("xmm0","xmm0");
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &pxor ("xmm3","xmm3");
+ &pxor ("xmm4","xmm4");
+ &pxor ("xmm5","xmm5");
+ &xor ("eax","eax");
+ &pop ("ebx");
+ &ret ();
+
+&set_label("bad_keybits",4);
+ &pxor ("xmm0","xmm0");
+ &mov ("eax",-2);
+ &pop ("ebx");
+ &ret ();
+&function_end_B("${PREFIX}_set_encrypt_key_base");
+
+# int $PREFIX_set_encrypt_key_alt (const unsigned char *userKey, int bits,
+# AES_KEY *key)
+&function_begin_B("${PREFIX}_set_encrypt_key_alt");
+ &record_function_hit(3);
+
+ &mov ("eax",&wparam(0));
+ &mov ($rounds,&wparam(1));
+ &mov ($key,&wparam(2));
+ &push ("ebx");
+
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop("ebx");
+ &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
+
+ &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
+ &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
+ &lea ($key,&DWP(16,$key));
+ &cmp ($rounds,256);
+ &je (&label("14rounds_alt"));
+ &cmp ($rounds,192);
+ &je (&label("12rounds_alt"));
+ &cmp ($rounds,128);
+ &jne (&label("bad_keybits"));
+
&set_label("10rounds_alt",16);
&movdqa ("xmm5",&QWP(0x00,"ebx"));
&mov ($rounds,8);
@@ -2249,63 +2399,8 @@
&jmp (&label("good_key"));
-&set_label("12rounds",16);
- &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
- &cmp ("ebp",1<<28);
- &je (&label("12rounds_alt"));
-
- &mov ($rounds,11);
- &$movekey (&QWP(-16,$key),"xmm0"); # round 0
- &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
- &call (&label("key_192a_cold"));
- &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
- &call (&label("key_192b"));
- &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
- &call (&label("key_192a"));
- &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
- &call (&label("key_192b"));
- &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
- &call (&label("key_192a"));
- &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
- &call (&label("key_192b"));
- &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
- &call (&label("key_192a"));
- &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
- &call (&label("key_192b"));
- &$movekey (&QWP(0,$key),"xmm0");
- &mov (&DWP(48,$key),$rounds);
-
- &jmp (&label("good_key"));
-
-&set_label("key_192a",16);
- &$movekey (&QWP(0,$key),"xmm0");
- &lea ($key,&DWP(16,$key));
-&set_label("key_192a_cold",16);
- &movaps ("xmm5","xmm2");
-&set_label("key_192b_warm");
- &shufps ("xmm4","xmm0",0b00010000);
- &movdqa ("xmm3","xmm2");
- &xorps ("xmm0","xmm4");
- &shufps ("xmm4","xmm0",0b10001100);
- &pslldq ("xmm3",4);
- &xorps ("xmm0","xmm4");
- &pshufd ("xmm1","xmm1",0b01010101); # critical path
- &pxor ("xmm2","xmm3");
- &pxor ("xmm0","xmm1");
- &pshufd ("xmm3","xmm0",0b11111111);
- &pxor ("xmm2","xmm3");
- &ret();
-
-&set_label("key_192b",16);
- &movaps ("xmm3","xmm0");
- &shufps ("xmm5","xmm0",0b01000100);
- &$movekey (&QWP(0,$key),"xmm5");
- &shufps ("xmm3","xmm2",0b01001110);
- &$movekey (&QWP(16,$key),"xmm3");
- &lea ($key,&DWP(32,$key));
- &jmp (&label("key_192b_warm"));
-
&set_label("12rounds_alt",16);
+ &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
&movdqa ("xmm5",&QWP(0x10,"ebx"));
&movdqa ("xmm4",&QWP(0x20,"ebx"));
&mov ($rounds,8);
@@ -2344,72 +2439,9 @@
&jmp (&label("good_key"));
-&set_label("14rounds",16);
+&set_label("14rounds_alt",16);
&movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
&lea ($key,&DWP(16,$key));
- &cmp ("ebp",1<<28);
- &je (&label("14rounds_alt"));
-
- &mov ($rounds,13);
- &$movekey (&QWP(-32,$key),"xmm0"); # round 0
- &$movekey (&QWP(-16,$key),"xmm2"); # round 1
- &aeskeygenassist("xmm1","xmm2",0x01); # round 2
- &call (&label("key_256a_cold"));
- &aeskeygenassist("xmm1","xmm0",0x01); # round 3
- &call (&label("key_256b"));
- &aeskeygenassist("xmm1","xmm2",0x02); # round 4
- &call (&label("key_256a"));
- &aeskeygenassist("xmm1","xmm0",0x02); # round 5
- &call (&label("key_256b"));
- &aeskeygenassist("xmm1","xmm2",0x04); # round 6
- &call (&label("key_256a"));
- &aeskeygenassist("xmm1","xmm0",0x04); # round 7
- &call (&label("key_256b"));
- &aeskeygenassist("xmm1","xmm2",0x08); # round 8
- &call (&label("key_256a"));
- &aeskeygenassist("xmm1","xmm0",0x08); # round 9
- &call (&label("key_256b"));
- &aeskeygenassist("xmm1","xmm2",0x10); # round 10
- &call (&label("key_256a"));
- &aeskeygenassist("xmm1","xmm0",0x10); # round 11
- &call (&label("key_256b"));
- &aeskeygenassist("xmm1","xmm2",0x20); # round 12
- &call (&label("key_256a"));
- &aeskeygenassist("xmm1","xmm0",0x20); # round 13
- &call (&label("key_256b"));
- &aeskeygenassist("xmm1","xmm2",0x40); # round 14
- &call (&label("key_256a"));
- &$movekey (&QWP(0,$key),"xmm0");
- &mov (&DWP(16,$key),$rounds);
- &xor ("eax","eax");
-
- &jmp (&label("good_key"));
-
-&set_label("key_256a",16);
- &$movekey (&QWP(0,$key),"xmm2");
- &lea ($key,&DWP(16,$key));
-&set_label("key_256a_cold");
- &shufps ("xmm4","xmm0",0b00010000);
- &xorps ("xmm0","xmm4");
- &shufps ("xmm4","xmm0",0b10001100);
- &xorps ("xmm0","xmm4");
- &shufps ("xmm1","xmm1",0b11111111); # critical path
- &xorps ("xmm0","xmm1");
- &ret();
-
-&set_label("key_256b",16);
- &$movekey (&QWP(0,$key),"xmm0");
- &lea ($key,&DWP(16,$key));
-
- &shufps ("xmm4","xmm2",0b00010000);
- &xorps ("xmm2","xmm4");
- &shufps ("xmm4","xmm2",0b10001100);
- &xorps ("xmm2","xmm4");
- &shufps ("xmm1","xmm1",0b10101010); # critical path
- &xorps ("xmm2","xmm1");
- &ret();
-
-&set_label("14rounds_alt",16);
&movdqa ("xmm5",&QWP(0x00,"ebx"));
&movdqa ("xmm4",&QWP(0x20,"ebx"));
&mov ($rounds,7);
@@ -2467,28 +2499,14 @@
&pxor ("xmm5","xmm5");
&xor ("eax","eax");
&pop ("ebx");
- &pop ("ebp");
&ret ();
&set_label("bad_keybits",4);
&pxor ("xmm0","xmm0");
&mov ("eax",-2);
&pop ("ebx");
- &pop ("ebp");
&ret ();
-&function_end_B("_aesni_set_encrypt_key");
-
-# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
-# AES_KEY *key)
-&function_begin_B("${PREFIX}_set_encrypt_key");
- &record_function_hit(3);
-
- &mov ("eax",&wparam(0));
- &mov ($rounds,&wparam(1));
- &mov ($key,&wparam(2));
- &call ("_aesni_set_encrypt_key");
- &ret ();
-&function_end_B("${PREFIX}_set_encrypt_key");
+&function_end_B("${PREFIX}_set_encrypt_key_alt");
# void $PREFIX_encrypt_key_to_decrypt_key (AES_KEY *key)
&function_begin_B("${PREFIX}_encrypt_key_to_decrypt_key");
diff --git a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
index 8b6036e..ab45749 100644
--- a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
@@ -211,7 +211,6 @@
("%rdi","%rsi","%rdx","%rcx"); # Unix order
$code=".text\n";
-$code.=".extern OPENSSL_ia32cap_P\n";
$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
@@ -3245,11 +3244,14 @@
# are used. Note that it's declared "abi-omnipotent", which means that
# amount of volatile registers is smaller on Windows.
#
+# There are two variants of this function, one which uses aeskeygenassist
+# ("base") and one which uses aesenclast + pshufb ("alt"). See aes/internal.h
+# for details.
$code.=<<___;
-.globl ${PREFIX}_set_encrypt_key
-.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
+.globl ${PREFIX}_set_encrypt_key_base
+.type ${PREFIX}_set_encrypt_key_base,\@abi-omnipotent
.align 16
-${PREFIX}_set_encrypt_key:
+${PREFIX}_set_encrypt_key_base:
.cfi_startproc
.seh_startproc
_CET_ENDBR
@@ -3262,9 +3264,6 @@
.seh_endprologue
movups ($inp),%xmm0 # pull first 128 bits of *userKey
xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
- leaq OPENSSL_ia32cap_P(%rip),%r10
- movl 4(%r10),%r10d
- and \$`1<<28|1<<11`,%r10d # AVX and XOP bits
lea 16($key),%rax # %rax is used as modifiable copy of $key
cmp \$256,$bits
je .L14rounds
@@ -3275,8 +3274,6 @@
.L10rounds:
mov \$9,$bits # 10 rounds for 128-bit key
- cmp \$`1<<28`,%r10d # AVX, bit no XOP
- je .L10rounds_alt
$movkey %xmm0,($key) # round 0
aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
@@ -3305,78 +3302,9 @@
jmp .Lenc_key_ret
.align 16
-.L10rounds_alt:
- movdqa .Lkey_rotate(%rip),%xmm5
- mov \$8,%r10d
- movdqa .Lkey_rcon1(%rip),%xmm4
- movdqa %xmm0,%xmm2
- movdqu %xmm0,($key)
- jmp .Loop_key128
-
-.align 16
-.Loop_key128:
- pshufb %xmm5,%xmm0
- aesenclast %xmm4,%xmm0
- pslld \$1,%xmm4
- lea 16(%rax),%rax
-
- movdqa %xmm2,%xmm3
- pslldq \$4,%xmm2
- pxor %xmm2,%xmm3
- pslldq \$4,%xmm2
- pxor %xmm2,%xmm3
- pslldq \$4,%xmm2
- pxor %xmm3,%xmm2
-
- pxor %xmm2,%xmm0
- movdqu %xmm0,-16(%rax)
- movdqa %xmm0,%xmm2
-
- dec %r10d
- jnz .Loop_key128
-
- movdqa .Lkey_rcon1b(%rip),%xmm4
-
- pshufb %xmm5,%xmm0
- aesenclast %xmm4,%xmm0
- pslld \$1,%xmm4
-
- movdqa %xmm2,%xmm3
- pslldq \$4,%xmm2
- pxor %xmm2,%xmm3
- pslldq \$4,%xmm2
- pxor %xmm2,%xmm3
- pslldq \$4,%xmm2
- pxor %xmm3,%xmm2
-
- pxor %xmm2,%xmm0
- movdqu %xmm0,(%rax)
-
- movdqa %xmm0,%xmm2
- pshufb %xmm5,%xmm0
- aesenclast %xmm4,%xmm0
-
- movdqa %xmm2,%xmm3
- pslldq \$4,%xmm2
- pxor %xmm2,%xmm3
- pslldq \$4,%xmm2
- pxor %xmm2,%xmm3
- pslldq \$4,%xmm2
- pxor %xmm3,%xmm2
-
- pxor %xmm2,%xmm0
- movdqu %xmm0,16(%rax)
-
- mov $bits,96(%rax) # 240($key)
- xor %eax,%eax
- jmp .Lenc_key_ret
-
-.align 16
.L12rounds:
movq 16($inp),%xmm2 # remaining 1/3 of *userKey
mov \$11,$bits # 12 rounds for 192
- cmp \$`1<<28`,%r10d # AVX, but no XOP
- je .L12rounds_alt
$movkey %xmm0,($key) # round 0
aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
@@ -3401,53 +3329,10 @@
jmp .Lenc_key_ret
.align 16
-.L12rounds_alt:
- movdqa .Lkey_rotate192(%rip),%xmm5
- movdqa .Lkey_rcon1(%rip),%xmm4
- mov \$8,%r10d
- movdqu %xmm0,($key)
- jmp .Loop_key192
-
-.align 16
-.Loop_key192:
- movq %xmm2,0(%rax)
- movdqa %xmm2,%xmm1
- pshufb %xmm5,%xmm2
- aesenclast %xmm4,%xmm2
- pslld \$1, %xmm4
- lea 24(%rax),%rax
-
- movdqa %xmm0,%xmm3
- pslldq \$4,%xmm0
- pxor %xmm0,%xmm3
- pslldq \$4,%xmm0
- pxor %xmm0,%xmm3
- pslldq \$4,%xmm0
- pxor %xmm3,%xmm0
-
- pshufd \$0xff,%xmm0,%xmm3
- pxor %xmm1,%xmm3
- pslldq \$4,%xmm1
- pxor %xmm1,%xmm3
-
- pxor %xmm2,%xmm0
- pxor %xmm3,%xmm2
- movdqu %xmm0,-16(%rax)
-
- dec %r10d
- jnz .Loop_key192
-
- mov $bits,32(%rax) # 240($key)
- xor %eax,%eax
- jmp .Lenc_key_ret
-
-.align 16
.L14rounds:
movups 16($inp),%xmm2 # remaining half of *userKey
mov \$13,$bits # 14 rounds for 256
lea 16(%rax),%rax
- cmp \$`1<<28`,%r10d # AVX, but no XOP
- je .L14rounds_alt
$movkey %xmm0,($key) # round 0
$movkey %xmm2,16($key) # round 1
@@ -3483,60 +3368,6 @@
jmp .Lenc_key_ret
.align 16
-.L14rounds_alt:
- movdqa .Lkey_rotate(%rip),%xmm5
- movdqa .Lkey_rcon1(%rip),%xmm4
- mov \$7,%r10d
- movdqu %xmm0,0($key)
- movdqa %xmm2,%xmm1
- movdqu %xmm2,16($key)
- jmp .Loop_key256
-
-.align 16
-.Loop_key256:
- pshufb %xmm5,%xmm2
- aesenclast %xmm4,%xmm2
-
- movdqa %xmm0,%xmm3
- pslldq \$4,%xmm0
- pxor %xmm0,%xmm3
- pslldq \$4,%xmm0
- pxor %xmm0,%xmm3
- pslldq \$4,%xmm0
- pxor %xmm3,%xmm0
- pslld \$1,%xmm4
-
- pxor %xmm2,%xmm0
- movdqu %xmm0,(%rax)
-
- dec %r10d
- jz .Ldone_key256
-
- pshufd \$0xff,%xmm0,%xmm2
- pxor %xmm3,%xmm3
- aesenclast %xmm3,%xmm2
-
- movdqa %xmm1,%xmm3
- pslldq \$4,%xmm1
- pxor %xmm1,%xmm3
- pslldq \$4,%xmm1
- pxor %xmm1,%xmm3
- pslldq \$4,%xmm1
- pxor %xmm3,%xmm1
-
- pxor %xmm1,%xmm2
- movdqu %xmm2,16(%rax)
- lea 32(%rax),%rax
- movdqa %xmm2,%xmm1
-
- jmp .Loop_key256
-
-.Ldone_key256:
- mov $bits,16(%rax) # 240($key)
- xor %eax,%eax
- jmp .Lenc_key_ret
-
-.align 16
.Lbad_keybits:
mov \$-2,%rax
.Lenc_key_ret:
@@ -3620,7 +3451,214 @@
shufps \$0b10101010,%xmm1,%xmm1 # critical path
xorps %xmm1,%xmm2
ret
-.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+.size ${PREFIX}_set_encrypt_key_base,.-${PREFIX}_set_encrypt_key_base
+
+.globl ${PREFIX}_set_encrypt_key_alt
+.type ${PREFIX}_set_encrypt_key_alt,\@abi-omnipotent
+.align 16
+${PREFIX}_set_encrypt_key_alt:
+.cfi_startproc
+.seh_startproc
+ _CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+ movb \$1,BORINGSSL_function_hit+3(%rip)
+#endif
+ sub \$8,%rsp
+.cfi_adjust_cfa_offset 8
+.seh_stackalloc 8
+.seh_endprologue
+ movups ($inp),%xmm0 # pull first 128 bits of *userKey
+ xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
+ lea 16($key),%rax # %rax is used as modifiable copy of $key
+ cmp \$256,$bits
+ je .L14rounds_alt
+ cmp \$192,$bits
+ je .L12rounds_alt
+ cmp \$128,$bits
+ jne .Lbad_keybits_alt
+
+ mov \$9,$bits # 10 rounds for 128-bit key
+ movdqa .Lkey_rotate(%rip),%xmm5
+ mov \$8,%r10d
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,($key)
+ jmp .Loop_key128
+
+.align 16
+.Loop_key128:
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
+ pslld \$1,%xmm4
+ lea 16(%rax),%rax
+
+ movdqa %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%rax)
+ movdqa %xmm0,%xmm2
+
+ dec %r10d
+ jnz .Loop_key128
+
+ movdqa .Lkey_rcon1b(%rip),%xmm4
+
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
+ pslld \$1,%xmm4
+
+ movdqa %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ movdqa %xmm0,%xmm2
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
+
+ movdqa %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%rax)
+
+ mov $bits,96(%rax) # 240($key)
+ xor %eax,%eax
+ jmp .Lenc_key_ret_alt
+
+.align 16
+.L12rounds_alt:
+ movq 16($inp),%xmm2 # remaining 1/3 of *userKey
+ mov \$11,$bits # 12 rounds for 192
+ movdqa .Lkey_rotate192(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ mov \$8,%r10d
+ movdqu %xmm0,($key)
+ jmp .Loop_key192
+
+.align 16
+.Loop_key192:
+ movq %xmm2,0(%rax)
+ movdqa %xmm2,%xmm1
+ pshufb %xmm5,%xmm2
+ aesenclast %xmm4,%xmm2
+ pslld \$1, %xmm4
+ lea 24(%rax),%rax
+
+ movdqa %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm3,%xmm0
+
+ pshufd \$0xff,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm1,%xmm3
+
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%rax)
+
+ dec %r10d
+ jnz .Loop_key192
+
+ mov $bits,32(%rax) # 240($key)
+ xor %eax,%eax
+ jmp .Lenc_key_ret_alt
+
+.align 16
+.L14rounds_alt:
+ movups 16($inp),%xmm2 # remaining half of *userKey
+ mov \$13,$bits # 14 rounds for 256
+ lea 16(%rax),%rax
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ mov \$7,%r10d
+ movdqu %xmm0,0($key)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,16($key)
+ jmp .Loop_key256
+
+.align 16
+.Loop_key256:
+ pshufb %xmm5,%xmm2
+ aesenclast %xmm4,%xmm2
+
+ movdqa %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld \$1,%xmm4
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ dec %r10d
+ jz .Ldone_key256
+
+ pshufd \$0xff,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+ aesenclast %xmm3,%xmm2
+
+ movdqa %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm3,%xmm1
+
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%rax)
+ lea 32(%rax),%rax
+ movdqa %xmm2,%xmm1
+
+ jmp .Loop_key256
+
+.Ldone_key256:
+ mov $bits,16(%rax) # 240($key)
+ xor %eax,%eax
+ jmp .Lenc_key_ret_alt
+
+.align 16
+.Lbad_keybits_alt:
+ mov \$-2,%rax
+.Lenc_key_ret_alt:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ add \$8,%rsp
+.cfi_adjust_cfa_offset -8
+ ret
+.cfi_endproc
+.seh_endproc
+.size ${PREFIX}_set_encrypt_key_alt,.-${PREFIX}_set_encrypt_key_alt
___
}
diff --git a/crypto/fipsmodule/aes/internal.h b/crypto/fipsmodule/aes/internal.h
index e7f55d2..7d2db3b 100644
--- a/crypto/fipsmodule/aes/internal.h
+++ b/crypto/fipsmodule/aes/internal.h
@@ -79,7 +79,27 @@
// On x86 and x86_64, |aes_hw_set_decrypt_key| is implemented in terms of
// |aes_hw_set_encrypt_key| and a conversion function.
void aes_hw_encrypt_key_to_decrypt_key(AES_KEY *key);
-#endif
+
+// There are two variants of this function, one which uses aeskeygenassist
+// ("base") and one which uses aesenclast + pshufb ("alt"). aesenclast is
+// overall faster but is slower on some older processors. It doesn't use AVX,
+// but AVX is used as a proxy to detecting this. See
+// https://groups.google.com/g/mailing.openssl.dev/c/OuFXwW4NfO8/m/7d2ZXVjkxVkJ
+//
+// TODO(davidben): It is unclear if the aeskeygenassist version is still
+// worthwhile. However, the aesenclast version requires SSSE3. SSSE3 long
+// predates AES-NI, but it's not clear if AES-NI implies SSSE3. In OpenSSL, the
+// CCM AES-NI assembly seems to assume it does.
+OPENSSL_INLINE int aes_hw_set_encrypt_key_alt_capable(void) {
+ return hwaes_capable() && CRYPTO_is_SSSE3_capable();
+}
+OPENSSL_INLINE int aes_hw_set_encrypt_key_alt_preferred(void) {
+ return hwaes_capable() && CRYPTO_is_AVX_capable();
+}
+int aes_hw_set_encrypt_key_base(const uint8_t *user_key, int bits,
+ AES_KEY *key);
+int aes_hw_set_encrypt_key_alt(const uint8_t *user_key, int bits, AES_KEY *key);
+#endif // OPENSSL_X86 || OPENSSL_X86_64
#else
diff --git a/crypto/perlasm/x86asm.pl b/crypto/perlasm/x86asm.pl
index f0d6310..b10eaf6 100644
--- a/crypto/perlasm/x86asm.pl
+++ b/crypto/perlasm/x86asm.pl
@@ -42,10 +42,10 @@
&preprocessor_ifdef("BORINGSSL_DISPATCH_TEST");
&push("ebx");
&push("edx");
- &call(&label("pic"));
- &set_label("pic");
+ &call(&label("pic_for_function_hit"));
+ &set_label("pic_for_function_hit");
&blindpop("ebx");
- &lea("ebx",&DWP("BORINGSSL_function_hit+$index"."-".&label("pic"),"ebx"));
+ &lea("ebx",&DWP("BORINGSSL_function_hit+$index"."-".&label("pic_for_function_hit"),"ebx"));
&mov("edx", 1);
&movb(&BP(0, "ebx"), "dl");
&pop("edx");
diff --git a/gen/bcm/aesni-x86-apple.S b/gen/bcm/aesni-x86-apple.S
index e64b4bb..dda66ed 100644
--- a/gen/bcm/aesni-x86-apple.S
+++ b/gen/bcm/aesni-x86-apple.S
@@ -15,10 +15,10 @@
#ifdef BORINGSSL_DISPATCH_TEST
pushl %ebx
pushl %edx
- call L000pic
-L000pic:
+ call L000pic_for_function_hit
+L000pic_for_function_hit:
popl %ebx
- leal _BORINGSSL_function_hit+1-L000pic(%ebx),%ebx
+ leal _BORINGSSL_function_hit+1-L000pic_for_function_hit(%ebx),%ebx
movl $1,%edx
movb %dl,(%ebx)
popl %edx
@@ -820,10 +820,10 @@
#ifdef BORINGSSL_DISPATCH_TEST
pushl %ebx
pushl %edx
- call L038pic
-L038pic:
+ call L038pic_for_function_hit
+L038pic_for_function_hit:
popl %ebx
- leal _BORINGSSL_function_hit+0-L038pic(%ebx),%ebx
+ leal _BORINGSSL_function_hit+0-L038pic_for_function_hit(%ebx),%ebx
movl $1,%edx
movb %dl,(%ebx)
popl %edx
@@ -2065,31 +2065,42 @@
popl %ebx
popl %ebp
ret
-.private_extern __aesni_set_encrypt_key
+.globl _aes_hw_set_encrypt_key_base
+.private_extern _aes_hw_set_encrypt_key_base
.align 4
-__aesni_set_encrypt_key:
- pushl %ebp
+_aes_hw_set_encrypt_key_base:
+L_aes_hw_set_encrypt_key_base_begin:
+#ifdef BORINGSSL_DISPATCH_TEST
pushl %ebx
- call L093pic
-L093pic:
+ pushl %edx
+ call L093pic_for_function_hit
+L093pic_for_function_hit:
popl %ebx
- leal Lkey_const-L093pic(%ebx),%ebx
- movl L_OPENSSL_ia32cap_P$non_lazy_ptr-Lkey_const(%ebx),%ebp
+ leal _BORINGSSL_function_hit+3-L093pic_for_function_hit(%ebx),%ebx
+ movl $1,%edx
+ movb %dl,(%ebx)
+ popl %edx
+ popl %ebx
+#endif
+ movl 4(%esp),%eax
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ pushl %ebx
+ call L094pic
+L094pic:
+ popl %ebx
+ leal Lkey_const-L094pic(%ebx),%ebx
movups (%eax),%xmm0
xorps %xmm4,%xmm4
- movl 4(%ebp),%ebp
leal 16(%edx),%edx
- andl $268437504,%ebp
cmpl $256,%ecx
- je L09414rounds
+ je L09514rounds
cmpl $192,%ecx
- je L09512rounds
+ je L09612rounds
cmpl $128,%ecx
- jne L096bad_keybits
+ jne L097bad_keybits
.align 4,0x90
-L09710rounds:
- cmpl $268435456,%ebp
- je L09810rounds_alt
+L09810rounds:
movl $9,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,200,1
@@ -2128,13 +2139,176 @@
xorps %xmm1,%xmm0
ret
.align 4,0x90
-L09810rounds_alt:
+L09612rounds:
+ movq 16(%eax),%xmm2
+ movl $11,%ecx
+ movups %xmm0,-16(%edx)
+.byte 102,15,58,223,202,1
+ call L102key_192a_cold
+.byte 102,15,58,223,202,2
+ call L103key_192b
+.byte 102,15,58,223,202,4
+ call L104key_192a
+.byte 102,15,58,223,202,8
+ call L103key_192b
+.byte 102,15,58,223,202,16
+ call L104key_192a
+.byte 102,15,58,223,202,32
+ call L103key_192b
+.byte 102,15,58,223,202,64
+ call L104key_192a
+.byte 102,15,58,223,202,128
+ call L103key_192b
+ movups %xmm0,(%edx)
+ movl %ecx,48(%edx)
+ jmp L101good_key
+.align 4,0x90
+L104key_192a:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+.align 4,0x90
+L102key_192a_cold:
+ movaps %xmm2,%xmm5
+L105key_192b_warm:
+ shufps $16,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ pslldq $4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd $85,%xmm1,%xmm1
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ ret
+.align 4,0x90
+L103key_192b:
+ movaps %xmm0,%xmm3
+ shufps $68,%xmm0,%xmm5
+ movups %xmm5,(%edx)
+ shufps $78,%xmm2,%xmm3
+ movups %xmm3,16(%edx)
+ leal 32(%edx),%edx
+ jmp L105key_192b_warm
+.align 4,0x90
+L09514rounds:
+ movups 16(%eax),%xmm2
+ leal 16(%edx),%edx
+ movl $13,%ecx
+ movups %xmm0,-32(%edx)
+ movups %xmm2,-16(%edx)
+.byte 102,15,58,223,202,1
+ call L106key_256a_cold
+.byte 102,15,58,223,200,1
+ call L107key_256b
+.byte 102,15,58,223,202,2
+ call L108key_256a
+.byte 102,15,58,223,200,2
+ call L107key_256b
+.byte 102,15,58,223,202,4
+ call L108key_256a
+.byte 102,15,58,223,200,4
+ call L107key_256b
+.byte 102,15,58,223,202,8
+ call L108key_256a
+.byte 102,15,58,223,200,8
+ call L107key_256b
+.byte 102,15,58,223,202,16
+ call L108key_256a
+.byte 102,15,58,223,200,16
+ call L107key_256b
+.byte 102,15,58,223,202,32
+ call L108key_256a
+.byte 102,15,58,223,200,32
+ call L107key_256b
+.byte 102,15,58,223,202,64
+ call L108key_256a
+ movups %xmm0,(%edx)
+ movl %ecx,16(%edx)
+ xorl %eax,%eax
+ jmp L101good_key
+.align 4,0x90
+L108key_256a:
+ movups %xmm2,(%edx)
+ leal 16(%edx),%edx
+L106key_256a_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ ret
+.align 4,0x90
+L107key_256b:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+ shufps $16,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $140,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $170,%xmm1,%xmm1
+ xorps %xmm1,%xmm2
+ ret
+L101good_key:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ xorl %eax,%eax
+ popl %ebx
+ ret
+.align 2,0x90
+L097bad_keybits:
+ pxor %xmm0,%xmm0
+ movl $-2,%eax
+ popl %ebx
+ ret
+.globl _aes_hw_set_encrypt_key_alt
+.private_extern _aes_hw_set_encrypt_key_alt
+.align 4
+_aes_hw_set_encrypt_key_alt:
+L_aes_hw_set_encrypt_key_alt_begin:
+#ifdef BORINGSSL_DISPATCH_TEST
+ pushl %ebx
+ pushl %edx
+ call L109pic_for_function_hit
+L109pic_for_function_hit:
+ popl %ebx
+ leal _BORINGSSL_function_hit+3-L109pic_for_function_hit(%ebx),%ebx
+ movl $1,%edx
+ movb %dl,(%ebx)
+ popl %edx
+ popl %ebx
+#endif
+ movl 4(%esp),%eax
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ pushl %ebx
+ call L110pic
+L110pic:
+ popl %ebx
+ leal Lkey_const-L110pic(%ebx),%ebx
+ movups (%eax),%xmm0
+ xorps %xmm4,%xmm4
+ leal 16(%edx),%edx
+ cmpl $256,%ecx
+ je L11114rounds_alt
+ cmpl $192,%ecx
+ je L11212rounds_alt
+ cmpl $128,%ecx
+ jne L113bad_keybits
+.align 4,0x90
+L11410rounds_alt:
movdqa (%ebx),%xmm5
movl $8,%ecx
movdqa 32(%ebx),%xmm4
movdqa %xmm0,%xmm2
movdqu %xmm0,-16(%edx)
-L102loop_key128:
+L115loop_key128:
.byte 102,15,56,0,197
.byte 102,15,56,221,196
pslld $1,%xmm4
@@ -2150,7 +2324,7 @@
movdqu %xmm0,-16(%edx)
movdqa %xmm0,%xmm2
decl %ecx
- jnz L102loop_key128
+ jnz L115loop_key128
movdqa 48(%ebx),%xmm4
.byte 102,15,56,0,197
.byte 102,15,56,221,196
@@ -2178,69 +2352,15 @@
movdqu %xmm0,16(%edx)
movl $9,%ecx
movl %ecx,96(%edx)
- jmp L101good_key
+ jmp L116good_key
.align 4,0x90
-L09512rounds:
+L11212rounds_alt:
movq 16(%eax),%xmm2
- cmpl $268435456,%ebp
- je L10312rounds_alt
- movl $11,%ecx
- movups %xmm0,-16(%edx)
-.byte 102,15,58,223,202,1
- call L104key_192a_cold
-.byte 102,15,58,223,202,2
- call L105key_192b
-.byte 102,15,58,223,202,4
- call L106key_192a
-.byte 102,15,58,223,202,8
- call L105key_192b
-.byte 102,15,58,223,202,16
- call L106key_192a
-.byte 102,15,58,223,202,32
- call L105key_192b
-.byte 102,15,58,223,202,64
- call L106key_192a
-.byte 102,15,58,223,202,128
- call L105key_192b
- movups %xmm0,(%edx)
- movl %ecx,48(%edx)
- jmp L101good_key
-.align 4,0x90
-L106key_192a:
- movups %xmm0,(%edx)
- leal 16(%edx),%edx
-.align 4,0x90
-L104key_192a_cold:
- movaps %xmm2,%xmm5
-L107key_192b_warm:
- shufps $16,%xmm0,%xmm4
- movdqa %xmm2,%xmm3
- xorps %xmm4,%xmm0
- shufps $140,%xmm0,%xmm4
- pslldq $4,%xmm3
- xorps %xmm4,%xmm0
- pshufd $85,%xmm1,%xmm1
- pxor %xmm3,%xmm2
- pxor %xmm1,%xmm0
- pshufd $255,%xmm0,%xmm3
- pxor %xmm3,%xmm2
- ret
-.align 4,0x90
-L105key_192b:
- movaps %xmm0,%xmm3
- shufps $68,%xmm0,%xmm5
- movups %xmm5,(%edx)
- shufps $78,%xmm2,%xmm3
- movups %xmm3,16(%edx)
- leal 32(%edx),%edx
- jmp L107key_192b_warm
-.align 4,0x90
-L10312rounds_alt:
movdqa 16(%ebx),%xmm5
movdqa 32(%ebx),%xmm4
movl $8,%ecx
movdqu %xmm0,-16(%edx)
-L108loop_key192:
+L117loop_key192:
movq %xmm2,(%edx)
movdqa %xmm2,%xmm1
.byte 102,15,56,0,213
@@ -2262,81 +2382,21 @@
pxor %xmm3,%xmm2
movdqu %xmm0,-16(%edx)
decl %ecx
- jnz L108loop_key192
+ jnz L117loop_key192
movl $11,%ecx
movl %ecx,32(%edx)
- jmp L101good_key
+ jmp L116good_key
.align 4,0x90
-L09414rounds:
+L11114rounds_alt:
movups 16(%eax),%xmm2
leal 16(%edx),%edx
- cmpl $268435456,%ebp
- je L10914rounds_alt
- movl $13,%ecx
- movups %xmm0,-32(%edx)
- movups %xmm2,-16(%edx)
-.byte 102,15,58,223,202,1
- call L110key_256a_cold
-.byte 102,15,58,223,200,1
- call L111key_256b
-.byte 102,15,58,223,202,2
- call L112key_256a
-.byte 102,15,58,223,200,2
- call L111key_256b
-.byte 102,15,58,223,202,4
- call L112key_256a
-.byte 102,15,58,223,200,4
- call L111key_256b
-.byte 102,15,58,223,202,8
- call L112key_256a
-.byte 102,15,58,223,200,8
- call L111key_256b
-.byte 102,15,58,223,202,16
- call L112key_256a
-.byte 102,15,58,223,200,16
- call L111key_256b
-.byte 102,15,58,223,202,32
- call L112key_256a
-.byte 102,15,58,223,200,32
- call L111key_256b
-.byte 102,15,58,223,202,64
- call L112key_256a
- movups %xmm0,(%edx)
- movl %ecx,16(%edx)
- xorl %eax,%eax
- jmp L101good_key
-.align 4,0x90
-L112key_256a:
- movups %xmm2,(%edx)
- leal 16(%edx),%edx
-L110key_256a_cold:
- shufps $16,%xmm0,%xmm4
- xorps %xmm4,%xmm0
- shufps $140,%xmm0,%xmm4
- xorps %xmm4,%xmm0
- shufps $255,%xmm1,%xmm1
- xorps %xmm1,%xmm0
- ret
-.align 4,0x90
-L111key_256b:
- movups %xmm0,(%edx)
- leal 16(%edx),%edx
- shufps $16,%xmm2,%xmm4
- xorps %xmm4,%xmm2
- shufps $140,%xmm2,%xmm4
- xorps %xmm4,%xmm2
- shufps $170,%xmm1,%xmm1
- xorps %xmm1,%xmm2
- ret
-.align 4,0x90
-L10914rounds_alt:
movdqa (%ebx),%xmm5
movdqa 32(%ebx),%xmm4
movl $7,%ecx
movdqu %xmm0,-32(%edx)
movdqa %xmm2,%xmm1
movdqu %xmm2,-16(%edx)
-L113loop_key256:
+L118loop_key256:
.byte 102,15,56,0,213
.byte 102,15,56,221,212
movdqa %xmm0,%xmm3
@@ -2350,7 +2410,7 @@
pxor %xmm2,%xmm0
movdqu %xmm0,(%edx)
decl %ecx
- jz L114done_key256
+ jz L119done_key256
pshufd $255,%xmm0,%xmm2
pxor %xmm3,%xmm3
.byte 102,15,56,221,211
@@ -2365,11 +2425,11 @@
movdqu %xmm2,16(%edx)
leal 32(%edx),%edx
movdqa %xmm2,%xmm1
- jmp L113loop_key256
-L114done_key256:
+ jmp L118loop_key256
+L119done_key256:
movl $13,%ecx
movl %ecx,16(%edx)
-L101good_key:
+L116good_key:
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
@@ -2378,36 +2438,12 @@
pxor %xmm5,%xmm5
xorl %eax,%eax
popl %ebx
- popl %ebp
ret
.align 2,0x90
-L096bad_keybits:
+L113bad_keybits:
pxor %xmm0,%xmm0
movl $-2,%eax
popl %ebx
- popl %ebp
- ret
-.globl _aes_hw_set_encrypt_key
-.private_extern _aes_hw_set_encrypt_key
-.align 4
-_aes_hw_set_encrypt_key:
-L_aes_hw_set_encrypt_key_begin:
-#ifdef BORINGSSL_DISPATCH_TEST
- pushl %ebx
- pushl %edx
- call L115pic
-L115pic:
- popl %ebx
- leal _BORINGSSL_function_hit+3-L115pic(%ebx),%ebx
- movl $1,%edx
- movb %dl,(%ebx)
- popl %edx
- popl %ebx
-#endif
- movl 4(%esp),%eax
- movl 8(%esp),%ecx
- movl 12(%esp),%edx
- call __aesni_set_encrypt_key
ret
.globl _aes_hw_encrypt_key_to_decrypt_key
.private_extern _aes_hw_encrypt_key_to_decrypt_key
@@ -2424,7 +2460,7 @@
movups %xmm1,(%edx)
leal 16(%edx),%edx
leal -16(%eax),%eax
-L116dec_key_inverse:
+L120dec_key_inverse:
movups (%edx),%xmm0
movups (%eax),%xmm1
.byte 102,15,56,219,192
@@ -2434,7 +2470,7 @@
movups %xmm0,16(%eax)
movups %xmm1,-16(%edx)
cmpl %edx,%eax
- ja L116dec_key_inverse
+ ja L120dec_key_inverse
movups (%edx),%xmm0
.byte 102,15,56,219,192
movups %xmm0,(%edx)
@@ -2451,8 +2487,4 @@
.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
.byte 115,108,46,111,114,103,62,0
-.section __IMPORT,__pointers,non_lazy_symbol_pointers
-L_OPENSSL_ia32cap_P$non_lazy_ptr:
-.indirect_symbol _OPENSSL_ia32cap_P
-.long 0
#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/aesni-x86-linux.S b/gen/bcm/aesni-x86-linux.S
index 1f15c71..6fefe17 100644
--- a/gen/bcm/aesni-x86-linux.S
+++ b/gen/bcm/aesni-x86-linux.S
@@ -16,10 +16,10 @@
#ifdef BORINGSSL_DISPATCH_TEST
pushl %ebx
pushl %edx
- call .L000pic
-.L000pic:
+ call .L000pic_for_function_hit
+.L000pic_for_function_hit:
popl %ebx
- leal BORINGSSL_function_hit+1-.L000pic(%ebx),%ebx
+ leal BORINGSSL_function_hit+1-.L000pic_for_function_hit(%ebx),%ebx
movl $1,%edx
movb %dl,(%ebx)
popl %edx
@@ -847,10 +847,10 @@
#ifdef BORINGSSL_DISPATCH_TEST
pushl %ebx
pushl %edx
- call .L038pic
-.L038pic:
+ call .L038pic_for_function_hit
+.L038pic_for_function_hit:
popl %ebx
- leal BORINGSSL_function_hit+0-.L038pic(%ebx),%ebx
+ leal BORINGSSL_function_hit+0-.L038pic_for_function_hit(%ebx),%ebx
movl $1,%edx
movb %dl,(%ebx)
popl %edx
@@ -2099,32 +2099,43 @@
popl %ebp
ret
.size aes_hw_cbc_encrypt,.-.L_aes_hw_cbc_encrypt_begin
-.hidden _aesni_set_encrypt_key
-.type _aesni_set_encrypt_key,@function
+.globl aes_hw_set_encrypt_key_base
+.hidden aes_hw_set_encrypt_key_base
+.type aes_hw_set_encrypt_key_base,@function
.align 16
-_aesni_set_encrypt_key:
- pushl %ebp
+aes_hw_set_encrypt_key_base:
+.L_aes_hw_set_encrypt_key_base_begin:
+#ifdef BORINGSSL_DISPATCH_TEST
pushl %ebx
- call .L093pic
-.L093pic:
+ pushl %edx
+ call .L093pic_for_function_hit
+.L093pic_for_function_hit:
popl %ebx
- leal .Lkey_const-.L093pic(%ebx),%ebx
- leal OPENSSL_ia32cap_P-.Lkey_const(%ebx),%ebp
+ leal BORINGSSL_function_hit+3-.L093pic_for_function_hit(%ebx),%ebx
+ movl $1,%edx
+ movb %dl,(%ebx)
+ popl %edx
+ popl %ebx
+#endif
+ movl 4(%esp),%eax
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ pushl %ebx
+ call .L094pic
+.L094pic:
+ popl %ebx
+ leal .Lkey_const-.L094pic(%ebx),%ebx
movups (%eax),%xmm0
xorps %xmm4,%xmm4
- movl 4(%ebp),%ebp
leal 16(%edx),%edx
- andl $268437504,%ebp
cmpl $256,%ecx
- je .L09414rounds
+ je .L09514rounds
cmpl $192,%ecx
- je .L09512rounds
+ je .L09612rounds
cmpl $128,%ecx
- jne .L096bad_keybits
+ jne .L097bad_keybits
.align 16
-.L09710rounds:
- cmpl $268435456,%ebp
- je .L09810rounds_alt
+.L09810rounds:
movl $9,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,200,1
@@ -2163,13 +2174,178 @@
xorps %xmm1,%xmm0
ret
.align 16
-.L09810rounds_alt:
+.L09612rounds:
+ movq 16(%eax),%xmm2
+ movl $11,%ecx
+ movups %xmm0,-16(%edx)
+.byte 102,15,58,223,202,1
+ call .L102key_192a_cold
+.byte 102,15,58,223,202,2
+ call .L103key_192b
+.byte 102,15,58,223,202,4
+ call .L104key_192a
+.byte 102,15,58,223,202,8
+ call .L103key_192b
+.byte 102,15,58,223,202,16
+ call .L104key_192a
+.byte 102,15,58,223,202,32
+ call .L103key_192b
+.byte 102,15,58,223,202,64
+ call .L104key_192a
+.byte 102,15,58,223,202,128
+ call .L103key_192b
+ movups %xmm0,(%edx)
+ movl %ecx,48(%edx)
+ jmp .L101good_key
+.align 16
+.L104key_192a:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+.align 16
+.L102key_192a_cold:
+ movaps %xmm2,%xmm5
+.L105key_192b_warm:
+ shufps $16,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ pslldq $4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd $85,%xmm1,%xmm1
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ ret
+.align 16
+.L103key_192b:
+ movaps %xmm0,%xmm3
+ shufps $68,%xmm0,%xmm5
+ movups %xmm5,(%edx)
+ shufps $78,%xmm2,%xmm3
+ movups %xmm3,16(%edx)
+ leal 32(%edx),%edx
+ jmp .L105key_192b_warm
+.align 16
+.L09514rounds:
+ movups 16(%eax),%xmm2
+ leal 16(%edx),%edx
+ movl $13,%ecx
+ movups %xmm0,-32(%edx)
+ movups %xmm2,-16(%edx)
+.byte 102,15,58,223,202,1
+ call .L106key_256a_cold
+.byte 102,15,58,223,200,1
+ call .L107key_256b
+.byte 102,15,58,223,202,2
+ call .L108key_256a
+.byte 102,15,58,223,200,2
+ call .L107key_256b
+.byte 102,15,58,223,202,4
+ call .L108key_256a
+.byte 102,15,58,223,200,4
+ call .L107key_256b
+.byte 102,15,58,223,202,8
+ call .L108key_256a
+.byte 102,15,58,223,200,8
+ call .L107key_256b
+.byte 102,15,58,223,202,16
+ call .L108key_256a
+.byte 102,15,58,223,200,16
+ call .L107key_256b
+.byte 102,15,58,223,202,32
+ call .L108key_256a
+.byte 102,15,58,223,200,32
+ call .L107key_256b
+.byte 102,15,58,223,202,64
+ call .L108key_256a
+ movups %xmm0,(%edx)
+ movl %ecx,16(%edx)
+ xorl %eax,%eax
+ jmp .L101good_key
+.align 16
+.L108key_256a:
+ movups %xmm2,(%edx)
+ leal 16(%edx),%edx
+.L106key_256a_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ ret
+.align 16
+.L107key_256b:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+ shufps $16,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $140,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $170,%xmm1,%xmm1
+ xorps %xmm1,%xmm2
+ ret
+.L101good_key:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ xorl %eax,%eax
+ popl %ebx
+ ret
+.align 4
+.L097bad_keybits:
+ pxor %xmm0,%xmm0
+ movl $-2,%eax
+ popl %ebx
+ ret
+.size aes_hw_set_encrypt_key_base,.-.L_aes_hw_set_encrypt_key_base_begin
+.globl aes_hw_set_encrypt_key_alt
+.hidden aes_hw_set_encrypt_key_alt
+.type aes_hw_set_encrypt_key_alt,@function
+.align 16
+aes_hw_set_encrypt_key_alt:
+.L_aes_hw_set_encrypt_key_alt_begin:
+#ifdef BORINGSSL_DISPATCH_TEST
+ pushl %ebx
+ pushl %edx
+ call .L109pic_for_function_hit
+.L109pic_for_function_hit:
+ popl %ebx
+ leal BORINGSSL_function_hit+3-.L109pic_for_function_hit(%ebx),%ebx
+ movl $1,%edx
+ movb %dl,(%ebx)
+ popl %edx
+ popl %ebx
+#endif
+ movl 4(%esp),%eax
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ pushl %ebx
+ call .L110pic
+.L110pic:
+ popl %ebx
+ leal .Lkey_const-.L110pic(%ebx),%ebx
+ movups (%eax),%xmm0
+ xorps %xmm4,%xmm4
+ leal 16(%edx),%edx
+ cmpl $256,%ecx
+ je .L11114rounds_alt
+ cmpl $192,%ecx
+ je .L11212rounds_alt
+ cmpl $128,%ecx
+ jne .L113bad_keybits
+.align 16
+.L11410rounds_alt:
movdqa (%ebx),%xmm5
movl $8,%ecx
movdqa 32(%ebx),%xmm4
movdqa %xmm0,%xmm2
movdqu %xmm0,-16(%edx)
-.L102loop_key128:
+.L115loop_key128:
.byte 102,15,56,0,197
.byte 102,15,56,221,196
pslld $1,%xmm4
@@ -2185,7 +2361,7 @@
movdqu %xmm0,-16(%edx)
movdqa %xmm0,%xmm2
decl %ecx
- jnz .L102loop_key128
+ jnz .L115loop_key128
movdqa 48(%ebx),%xmm4
.byte 102,15,56,0,197
.byte 102,15,56,221,196
@@ -2213,69 +2389,15 @@
movdqu %xmm0,16(%edx)
movl $9,%ecx
movl %ecx,96(%edx)
- jmp .L101good_key
+ jmp .L116good_key
.align 16
-.L09512rounds:
+.L11212rounds_alt:
movq 16(%eax),%xmm2
- cmpl $268435456,%ebp
- je .L10312rounds_alt
- movl $11,%ecx
- movups %xmm0,-16(%edx)
-.byte 102,15,58,223,202,1
- call .L104key_192a_cold
-.byte 102,15,58,223,202,2
- call .L105key_192b
-.byte 102,15,58,223,202,4
- call .L106key_192a
-.byte 102,15,58,223,202,8
- call .L105key_192b
-.byte 102,15,58,223,202,16
- call .L106key_192a
-.byte 102,15,58,223,202,32
- call .L105key_192b
-.byte 102,15,58,223,202,64
- call .L106key_192a
-.byte 102,15,58,223,202,128
- call .L105key_192b
- movups %xmm0,(%edx)
- movl %ecx,48(%edx)
- jmp .L101good_key
-.align 16
-.L106key_192a:
- movups %xmm0,(%edx)
- leal 16(%edx),%edx
-.align 16
-.L104key_192a_cold:
- movaps %xmm2,%xmm5
-.L107key_192b_warm:
- shufps $16,%xmm0,%xmm4
- movdqa %xmm2,%xmm3
- xorps %xmm4,%xmm0
- shufps $140,%xmm0,%xmm4
- pslldq $4,%xmm3
- xorps %xmm4,%xmm0
- pshufd $85,%xmm1,%xmm1
- pxor %xmm3,%xmm2
- pxor %xmm1,%xmm0
- pshufd $255,%xmm0,%xmm3
- pxor %xmm3,%xmm2
- ret
-.align 16
-.L105key_192b:
- movaps %xmm0,%xmm3
- shufps $68,%xmm0,%xmm5
- movups %xmm5,(%edx)
- shufps $78,%xmm2,%xmm3
- movups %xmm3,16(%edx)
- leal 32(%edx),%edx
- jmp .L107key_192b_warm
-.align 16
-.L10312rounds_alt:
movdqa 16(%ebx),%xmm5
movdqa 32(%ebx),%xmm4
movl $8,%ecx
movdqu %xmm0,-16(%edx)
-.L108loop_key192:
+.L117loop_key192:
movq %xmm2,(%edx)
movdqa %xmm2,%xmm1
.byte 102,15,56,0,213
@@ -2297,81 +2419,21 @@
pxor %xmm3,%xmm2
movdqu %xmm0,-16(%edx)
decl %ecx
- jnz .L108loop_key192
+ jnz .L117loop_key192
movl $11,%ecx
movl %ecx,32(%edx)
- jmp .L101good_key
+ jmp .L116good_key
.align 16
-.L09414rounds:
+.L11114rounds_alt:
movups 16(%eax),%xmm2
leal 16(%edx),%edx
- cmpl $268435456,%ebp
- je .L10914rounds_alt
- movl $13,%ecx
- movups %xmm0,-32(%edx)
- movups %xmm2,-16(%edx)
-.byte 102,15,58,223,202,1
- call .L110key_256a_cold
-.byte 102,15,58,223,200,1
- call .L111key_256b
-.byte 102,15,58,223,202,2
- call .L112key_256a
-.byte 102,15,58,223,200,2
- call .L111key_256b
-.byte 102,15,58,223,202,4
- call .L112key_256a
-.byte 102,15,58,223,200,4
- call .L111key_256b
-.byte 102,15,58,223,202,8
- call .L112key_256a
-.byte 102,15,58,223,200,8
- call .L111key_256b
-.byte 102,15,58,223,202,16
- call .L112key_256a
-.byte 102,15,58,223,200,16
- call .L111key_256b
-.byte 102,15,58,223,202,32
- call .L112key_256a
-.byte 102,15,58,223,200,32
- call .L111key_256b
-.byte 102,15,58,223,202,64
- call .L112key_256a
- movups %xmm0,(%edx)
- movl %ecx,16(%edx)
- xorl %eax,%eax
- jmp .L101good_key
-.align 16
-.L112key_256a:
- movups %xmm2,(%edx)
- leal 16(%edx),%edx
-.L110key_256a_cold:
- shufps $16,%xmm0,%xmm4
- xorps %xmm4,%xmm0
- shufps $140,%xmm0,%xmm4
- xorps %xmm4,%xmm0
- shufps $255,%xmm1,%xmm1
- xorps %xmm1,%xmm0
- ret
-.align 16
-.L111key_256b:
- movups %xmm0,(%edx)
- leal 16(%edx),%edx
- shufps $16,%xmm2,%xmm4
- xorps %xmm4,%xmm2
- shufps $140,%xmm2,%xmm4
- xorps %xmm4,%xmm2
- shufps $170,%xmm1,%xmm1
- xorps %xmm1,%xmm2
- ret
-.align 16
-.L10914rounds_alt:
movdqa (%ebx),%xmm5
movdqa 32(%ebx),%xmm4
movl $7,%ecx
movdqu %xmm0,-32(%edx)
movdqa %xmm2,%xmm1
movdqu %xmm2,-16(%edx)
-.L113loop_key256:
+.L118loop_key256:
.byte 102,15,56,0,213
.byte 102,15,56,221,212
movdqa %xmm0,%xmm3
@@ -2385,7 +2447,7 @@
pxor %xmm2,%xmm0
movdqu %xmm0,(%edx)
decl %ecx
- jz .L114done_key256
+ jz .L119done_key256
pshufd $255,%xmm0,%xmm2
pxor %xmm3,%xmm3
.byte 102,15,56,221,211
@@ -2400,11 +2462,11 @@
movdqu %xmm2,16(%edx)
leal 32(%edx),%edx
movdqa %xmm2,%xmm1
- jmp .L113loop_key256
-.L114done_key256:
+ jmp .L118loop_key256
+.L119done_key256:
movl $13,%ecx
movl %ecx,16(%edx)
-.L101good_key:
+.L116good_key:
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
@@ -2413,40 +2475,14 @@
pxor %xmm5,%xmm5
xorl %eax,%eax
popl %ebx
- popl %ebp
ret
.align 4
-.L096bad_keybits:
+.L113bad_keybits:
pxor %xmm0,%xmm0
movl $-2,%eax
popl %ebx
- popl %ebp
ret
-.size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key
-.globl aes_hw_set_encrypt_key
-.hidden aes_hw_set_encrypt_key
-.type aes_hw_set_encrypt_key,@function
-.align 16
-aes_hw_set_encrypt_key:
-.L_aes_hw_set_encrypt_key_begin:
-#ifdef BORINGSSL_DISPATCH_TEST
- pushl %ebx
- pushl %edx
- call .L115pic
-.L115pic:
- popl %ebx
- leal BORINGSSL_function_hit+3-.L115pic(%ebx),%ebx
- movl $1,%edx
- movb %dl,(%ebx)
- popl %edx
- popl %ebx
-#endif
- movl 4(%esp),%eax
- movl 8(%esp),%ecx
- movl 12(%esp),%edx
- call _aesni_set_encrypt_key
- ret
-.size aes_hw_set_encrypt_key,.-.L_aes_hw_set_encrypt_key_begin
+.size aes_hw_set_encrypt_key_alt,.-.L_aes_hw_set_encrypt_key_alt_begin
.globl aes_hw_encrypt_key_to_decrypt_key
.hidden aes_hw_encrypt_key_to_decrypt_key
.type aes_hw_encrypt_key_to_decrypt_key,@function
@@ -2463,7 +2499,7 @@
movups %xmm1,(%edx)
leal 16(%edx),%edx
leal -16(%eax),%eax
-.L116dec_key_inverse:
+.L120dec_key_inverse:
movups (%edx),%xmm0
movups (%eax),%xmm1
.byte 102,15,56,219,192
@@ -2473,7 +2509,7 @@
movups %xmm0,16(%eax)
movups %xmm1,-16(%edx)
cmpl %edx,%eax
- ja .L116dec_key_inverse
+ ja .L120dec_key_inverse
movups (%edx),%xmm0
.byte 102,15,56,219,192
movups %xmm0,(%edx)
diff --git a/gen/bcm/aesni-x86-win.asm b/gen/bcm/aesni-x86-win.asm
index 2d32e77..4453afb 100644
--- a/gen/bcm/aesni-x86-win.asm
+++ b/gen/bcm/aesni-x86-win.asm
@@ -13,7 +13,6 @@
%else
section .text code
%endif
-;extern _OPENSSL_ia32cap_P
%ifdef BORINGSSL_DISPATCH_TEST
extern _BORINGSSL_function_hit
%endif
@@ -24,10 +23,10 @@
%ifdef BORINGSSL_DISPATCH_TEST
push ebx
push edx
- call L$000pic
-L$000pic:
+ call L$000pic_for_function_hit
+L$000pic_for_function_hit:
pop ebx
- lea ebx,[(_BORINGSSL_function_hit+1-L$000pic)+ebx]
+ lea ebx,[(_BORINGSSL_function_hit+1-L$000pic_for_function_hit)+ebx]
mov edx,1
mov BYTE [ebx],dl
pop edx
@@ -816,10 +815,10 @@
%ifdef BORINGSSL_DISPATCH_TEST
push ebx
push edx
- call L$038pic
-L$038pic:
+ call L$038pic_for_function_hit
+L$038pic_for_function_hit:
pop ebx
- lea ebx,[(_BORINGSSL_function_hit+0-L$038pic)+ebx]
+ lea ebx,[(_BORINGSSL_function_hit+0-L$038pic_for_function_hit)+ebx]
mov edx,1
mov BYTE [ebx],dl
pop edx
@@ -2058,30 +2057,41 @@
pop ebx
pop ebp
ret
+global _aes_hw_set_encrypt_key_base
align 16
-__aesni_set_encrypt_key:
- push ebp
+_aes_hw_set_encrypt_key_base:
+L$_aes_hw_set_encrypt_key_base_begin:
+%ifdef BORINGSSL_DISPATCH_TEST
push ebx
- call L$093pic
-L$093pic:
+ push edx
+ call L$093pic_for_function_hit
+L$093pic_for_function_hit:
pop ebx
- lea ebx,[(L$key_const-L$093pic)+ebx]
- lea ebp,[_OPENSSL_ia32cap_P]
+ lea ebx,[(_BORINGSSL_function_hit+3-L$093pic_for_function_hit)+ebx]
+ mov edx,1
+ mov BYTE [ebx],dl
+ pop edx
+ pop ebx
+%endif
+ mov eax,DWORD [4+esp]
+ mov ecx,DWORD [8+esp]
+ mov edx,DWORD [12+esp]
+ push ebx
+ call L$094pic
+L$094pic:
+ pop ebx
+ lea ebx,[(L$key_const-L$094pic)+ebx]
movups xmm0,[eax]
xorps xmm4,xmm4
- mov ebp,DWORD [4+ebp]
lea edx,[16+edx]
- and ebp,268437504
cmp ecx,256
- je NEAR L$09414rounds
+ je NEAR L$09514rounds
cmp ecx,192
- je NEAR L$09512rounds
+ je NEAR L$09612rounds
cmp ecx,128
- jne NEAR L$096bad_keybits
+ jne NEAR L$097bad_keybits
align 16
-L$09710rounds:
- cmp ebp,268435456
- je NEAR L$09810rounds_alt
+L$09810rounds:
mov ecx,9
movups [edx-16],xmm0
db 102,15,58,223,200,1
@@ -2120,13 +2130,175 @@
xorps xmm0,xmm1
ret
align 16
-L$09810rounds_alt:
+L$09612rounds:
+ movq xmm2,[16+eax]
+ mov ecx,11
+ movups [edx-16],xmm0
+db 102,15,58,223,202,1
+ call L$102key_192a_cold
+db 102,15,58,223,202,2
+ call L$103key_192b
+db 102,15,58,223,202,4
+ call L$104key_192a
+db 102,15,58,223,202,8
+ call L$103key_192b
+db 102,15,58,223,202,16
+ call L$104key_192a
+db 102,15,58,223,202,32
+ call L$103key_192b
+db 102,15,58,223,202,64
+ call L$104key_192a
+db 102,15,58,223,202,128
+ call L$103key_192b
+ movups [edx],xmm0
+ mov DWORD [48+edx],ecx
+ jmp NEAR L$101good_key
+align 16
+L$104key_192a:
+ movups [edx],xmm0
+ lea edx,[16+edx]
+align 16
+L$102key_192a_cold:
+ movaps xmm5,xmm2
+L$105key_192b_warm:
+ shufps xmm4,xmm0,16
+ movdqa xmm3,xmm2
+ xorps xmm0,xmm4
+ shufps xmm4,xmm0,140
+ pslldq xmm3,4
+ xorps xmm0,xmm4
+ pshufd xmm1,xmm1,85
+ pxor xmm2,xmm3
+ pxor xmm0,xmm1
+ pshufd xmm3,xmm0,255
+ pxor xmm2,xmm3
+ ret
+align 16
+L$103key_192b:
+ movaps xmm3,xmm0
+ shufps xmm5,xmm0,68
+ movups [edx],xmm5
+ shufps xmm3,xmm2,78
+ movups [16+edx],xmm3
+ lea edx,[32+edx]
+ jmp NEAR L$105key_192b_warm
+align 16
+L$09514rounds:
+ movups xmm2,[16+eax]
+ lea edx,[16+edx]
+ mov ecx,13
+ movups [edx-32],xmm0
+ movups [edx-16],xmm2
+db 102,15,58,223,202,1
+ call L$106key_256a_cold
+db 102,15,58,223,200,1
+ call L$107key_256b
+db 102,15,58,223,202,2
+ call L$108key_256a
+db 102,15,58,223,200,2
+ call L$107key_256b
+db 102,15,58,223,202,4
+ call L$108key_256a
+db 102,15,58,223,200,4
+ call L$107key_256b
+db 102,15,58,223,202,8
+ call L$108key_256a
+db 102,15,58,223,200,8
+ call L$107key_256b
+db 102,15,58,223,202,16
+ call L$108key_256a
+db 102,15,58,223,200,16
+ call L$107key_256b
+db 102,15,58,223,202,32
+ call L$108key_256a
+db 102,15,58,223,200,32
+ call L$107key_256b
+db 102,15,58,223,202,64
+ call L$108key_256a
+ movups [edx],xmm0
+ mov DWORD [16+edx],ecx
+ xor eax,eax
+ jmp NEAR L$101good_key
+align 16
+L$108key_256a:
+ movups [edx],xmm2
+ lea edx,[16+edx]
+L$106key_256a_cold:
+ shufps xmm4,xmm0,16
+ xorps xmm0,xmm4
+ shufps xmm4,xmm0,140
+ xorps xmm0,xmm4
+ shufps xmm1,xmm1,255
+ xorps xmm0,xmm1
+ ret
+align 16
+L$107key_256b:
+ movups [edx],xmm0
+ lea edx,[16+edx]
+ shufps xmm4,xmm2,16
+ xorps xmm2,xmm4
+ shufps xmm4,xmm2,140
+ xorps xmm2,xmm4
+ shufps xmm1,xmm1,170
+ xorps xmm2,xmm1
+ ret
+L$101good_key:
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ xor eax,eax
+ pop ebx
+ ret
+align 4
+L$097bad_keybits:
+ pxor xmm0,xmm0
+ mov eax,-2
+ pop ebx
+ ret
+global _aes_hw_set_encrypt_key_alt
+align 16
+_aes_hw_set_encrypt_key_alt:
+L$_aes_hw_set_encrypt_key_alt_begin:
+%ifdef BORINGSSL_DISPATCH_TEST
+ push ebx
+ push edx
+ call L$109pic_for_function_hit
+L$109pic_for_function_hit:
+ pop ebx
+ lea ebx,[(_BORINGSSL_function_hit+3-L$109pic_for_function_hit)+ebx]
+ mov edx,1
+ mov BYTE [ebx],dl
+ pop edx
+ pop ebx
+%endif
+ mov eax,DWORD [4+esp]
+ mov ecx,DWORD [8+esp]
+ mov edx,DWORD [12+esp]
+ push ebx
+ call L$110pic
+L$110pic:
+ pop ebx
+ lea ebx,[(L$key_const-L$110pic)+ebx]
+ movups xmm0,[eax]
+ xorps xmm4,xmm4
+ lea edx,[16+edx]
+ cmp ecx,256
+ je NEAR L$11114rounds_alt
+ cmp ecx,192
+ je NEAR L$11212rounds_alt
+ cmp ecx,128
+ jne NEAR L$113bad_keybits
+align 16
+L$11410rounds_alt:
movdqa xmm5,[ebx]
mov ecx,8
movdqa xmm4,[32+ebx]
movdqa xmm2,xmm0
movdqu [edx-16],xmm0
-L$102loop_key128:
+L$115loop_key128:
db 102,15,56,0,197
db 102,15,56,221,196
pslld xmm4,1
@@ -2142,7 +2314,7 @@
movdqu [edx-16],xmm0
movdqa xmm2,xmm0
dec ecx
- jnz NEAR L$102loop_key128
+ jnz NEAR L$115loop_key128
movdqa xmm4,[48+ebx]
db 102,15,56,0,197
db 102,15,56,221,196
@@ -2170,69 +2342,15 @@
movdqu [16+edx],xmm0
mov ecx,9
mov DWORD [96+edx],ecx
- jmp NEAR L$101good_key
+ jmp NEAR L$116good_key
align 16
-L$09512rounds:
+L$11212rounds_alt:
movq xmm2,[16+eax]
- cmp ebp,268435456
- je NEAR L$10312rounds_alt
- mov ecx,11
- movups [edx-16],xmm0
-db 102,15,58,223,202,1
- call L$104key_192a_cold
-db 102,15,58,223,202,2
- call L$105key_192b
-db 102,15,58,223,202,4
- call L$106key_192a
-db 102,15,58,223,202,8
- call L$105key_192b
-db 102,15,58,223,202,16
- call L$106key_192a
-db 102,15,58,223,202,32
- call L$105key_192b
-db 102,15,58,223,202,64
- call L$106key_192a
-db 102,15,58,223,202,128
- call L$105key_192b
- movups [edx],xmm0
- mov DWORD [48+edx],ecx
- jmp NEAR L$101good_key
-align 16
-L$106key_192a:
- movups [edx],xmm0
- lea edx,[16+edx]
-align 16
-L$104key_192a_cold:
- movaps xmm5,xmm2
-L$107key_192b_warm:
- shufps xmm4,xmm0,16
- movdqa xmm3,xmm2
- xorps xmm0,xmm4
- shufps xmm4,xmm0,140
- pslldq xmm3,4
- xorps xmm0,xmm4
- pshufd xmm1,xmm1,85
- pxor xmm2,xmm3
- pxor xmm0,xmm1
- pshufd xmm3,xmm0,255
- pxor xmm2,xmm3
- ret
-align 16
-L$105key_192b:
- movaps xmm3,xmm0
- shufps xmm5,xmm0,68
- movups [edx],xmm5
- shufps xmm3,xmm2,78
- movups [16+edx],xmm3
- lea edx,[32+edx]
- jmp NEAR L$107key_192b_warm
-align 16
-L$10312rounds_alt:
movdqa xmm5,[16+ebx]
movdqa xmm4,[32+ebx]
mov ecx,8
movdqu [edx-16],xmm0
-L$108loop_key192:
+L$117loop_key192:
movq [edx],xmm2
movdqa xmm1,xmm2
db 102,15,56,0,213
@@ -2254,81 +2372,21 @@
pxor xmm2,xmm3
movdqu [edx-16],xmm0
dec ecx
- jnz NEAR L$108loop_key192
+ jnz NEAR L$117loop_key192
mov ecx,11
mov DWORD [32+edx],ecx
- jmp NEAR L$101good_key
+ jmp NEAR L$116good_key
align 16
-L$09414rounds:
+L$11114rounds_alt:
movups xmm2,[16+eax]
lea edx,[16+edx]
- cmp ebp,268435456
- je NEAR L$10914rounds_alt
- mov ecx,13
- movups [edx-32],xmm0
- movups [edx-16],xmm2
-db 102,15,58,223,202,1
- call L$110key_256a_cold
-db 102,15,58,223,200,1
- call L$111key_256b
-db 102,15,58,223,202,2
- call L$112key_256a
-db 102,15,58,223,200,2
- call L$111key_256b
-db 102,15,58,223,202,4
- call L$112key_256a
-db 102,15,58,223,200,4
- call L$111key_256b
-db 102,15,58,223,202,8
- call L$112key_256a
-db 102,15,58,223,200,8
- call L$111key_256b
-db 102,15,58,223,202,16
- call L$112key_256a
-db 102,15,58,223,200,16
- call L$111key_256b
-db 102,15,58,223,202,32
- call L$112key_256a
-db 102,15,58,223,200,32
- call L$111key_256b
-db 102,15,58,223,202,64
- call L$112key_256a
- movups [edx],xmm0
- mov DWORD [16+edx],ecx
- xor eax,eax
- jmp NEAR L$101good_key
-align 16
-L$112key_256a:
- movups [edx],xmm2
- lea edx,[16+edx]
-L$110key_256a_cold:
- shufps xmm4,xmm0,16
- xorps xmm0,xmm4
- shufps xmm4,xmm0,140
- xorps xmm0,xmm4
- shufps xmm1,xmm1,255
- xorps xmm0,xmm1
- ret
-align 16
-L$111key_256b:
- movups [edx],xmm0
- lea edx,[16+edx]
- shufps xmm4,xmm2,16
- xorps xmm2,xmm4
- shufps xmm4,xmm2,140
- xorps xmm2,xmm4
- shufps xmm1,xmm1,170
- xorps xmm2,xmm1
- ret
-align 16
-L$10914rounds_alt:
movdqa xmm5,[ebx]
movdqa xmm4,[32+ebx]
mov ecx,7
movdqu [edx-32],xmm0
movdqa xmm1,xmm2
movdqu [edx-16],xmm2
-L$113loop_key256:
+L$118loop_key256:
db 102,15,56,0,213
db 102,15,56,221,212
movdqa xmm3,xmm0
@@ -2342,7 +2400,7 @@
pxor xmm0,xmm2
movdqu [edx],xmm0
dec ecx
- jz NEAR L$114done_key256
+ jz NEAR L$119done_key256
pshufd xmm2,xmm0,255
pxor xmm3,xmm3
db 102,15,56,221,211
@@ -2357,11 +2415,11 @@
movdqu [16+edx],xmm2
lea edx,[32+edx]
movdqa xmm1,xmm2
- jmp NEAR L$113loop_key256
-L$114done_key256:
+ jmp NEAR L$118loop_key256
+L$119done_key256:
mov ecx,13
mov DWORD [16+edx],ecx
-L$101good_key:
+L$116good_key:
pxor xmm0,xmm0
pxor xmm1,xmm1
pxor xmm2,xmm2
@@ -2370,35 +2428,12 @@
pxor xmm5,xmm5
xor eax,eax
pop ebx
- pop ebp
ret
align 4
-L$096bad_keybits:
+L$113bad_keybits:
pxor xmm0,xmm0
mov eax,-2
pop ebx
- pop ebp
- ret
-global _aes_hw_set_encrypt_key
-align 16
-_aes_hw_set_encrypt_key:
-L$_aes_hw_set_encrypt_key_begin:
-%ifdef BORINGSSL_DISPATCH_TEST
- push ebx
- push edx
- call L$115pic
-L$115pic:
- pop ebx
- lea ebx,[(_BORINGSSL_function_hit+3-L$115pic)+ebx]
- mov edx,1
- mov BYTE [ebx],dl
- pop edx
- pop ebx
-%endif
- mov eax,DWORD [4+esp]
- mov ecx,DWORD [8+esp]
- mov edx,DWORD [12+esp]
- call __aesni_set_encrypt_key
ret
global _aes_hw_encrypt_key_to_decrypt_key
align 16
@@ -2414,7 +2449,7 @@
movups [edx],xmm1
lea edx,[16+edx]
lea eax,[eax-16]
-L$116dec_key_inverse:
+L$120dec_key_inverse:
movups xmm0,[edx]
movups xmm1,[eax]
db 102,15,56,219,192
@@ -2424,7 +2459,7 @@
movups [16+eax],xmm0
movups [edx-16],xmm1
cmp eax,edx
- ja NEAR L$116dec_key_inverse
+ ja NEAR L$120dec_key_inverse
movups xmm0,[edx]
db 102,15,56,219,192
movups [edx],xmm0
@@ -2441,8 +2476,6 @@
db 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
db 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
db 115,108,46,111,114,103,62,0
-segment .bss
-common _OPENSSL_ia32cap_P 16
%else
; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
ret
diff --git a/gen/bcm/aesni-x86_64-apple.S b/gen/bcm/aesni-x86_64-apple.S
index ccf9f8f..23c15c3 100644
--- a/gen/bcm/aesni-x86_64-apple.S
+++ b/gen/bcm/aesni-x86_64-apple.S
@@ -5,7 +5,6 @@
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
.text
-
.globl _aes_hw_encrypt
.private_extern _aes_hw_encrypt
@@ -1945,11 +1944,11 @@
ret
-.globl _aes_hw_set_encrypt_key
-.private_extern _aes_hw_set_encrypt_key
+.globl _aes_hw_set_encrypt_key_base
+.private_extern _aes_hw_set_encrypt_key_base
.p2align 4
-_aes_hw_set_encrypt_key:
+_aes_hw_set_encrypt_key_base:
_CET_ENDBR
@@ -1962,9 +1961,6 @@
movups (%rdi),%xmm0
xorps %xmm4,%xmm4
- leaq _OPENSSL_ia32cap_P(%rip),%r10
- movl 4(%r10),%r10d
- andl $268437504,%r10d
leaq 16(%rdx),%rax
cmpl $256,%esi
je L$14rounds
@@ -1975,8 +1971,6 @@
L$10rounds:
movl $9,%esi
- cmpl $268435456,%r10d
- je L$10rounds_alt
movups %xmm0,(%rdx)
.byte 102,15,58,223,200,1
@@ -2005,78 +1999,9 @@
jmp L$enc_key_ret
.p2align 4
-L$10rounds_alt:
- movdqa L$key_rotate(%rip),%xmm5
- movl $8,%r10d
- movdqa L$key_rcon1(%rip),%xmm4
- movdqa %xmm0,%xmm2
- movdqu %xmm0,(%rdx)
- jmp L$oop_key128
-
-.p2align 4
-L$oop_key128:
-.byte 102,15,56,0,197
-.byte 102,15,56,221,196
- pslld $1,%xmm4
- leaq 16(%rax),%rax
-
- movdqa %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm3,%xmm2
-
- pxor %xmm2,%xmm0
- movdqu %xmm0,-16(%rax)
- movdqa %xmm0,%xmm2
-
- decl %r10d
- jnz L$oop_key128
-
- movdqa L$key_rcon1b(%rip),%xmm4
-
-.byte 102,15,56,0,197
-.byte 102,15,56,221,196
- pslld $1,%xmm4
-
- movdqa %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm3,%xmm2
-
- pxor %xmm2,%xmm0
- movdqu %xmm0,(%rax)
-
- movdqa %xmm0,%xmm2
-.byte 102,15,56,0,197
-.byte 102,15,56,221,196
-
- movdqa %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm3,%xmm2
-
- pxor %xmm2,%xmm0
- movdqu %xmm0,16(%rax)
-
- movl %esi,96(%rax)
- xorl %eax,%eax
- jmp L$enc_key_ret
-
-.p2align 4
L$12rounds:
movq 16(%rdi),%xmm2
movl $11,%esi
- cmpl $268435456,%r10d
- je L$12rounds_alt
movups %xmm0,(%rdx)
.byte 102,15,58,223,202,1
@@ -2101,53 +2026,10 @@
jmp L$enc_key_ret
.p2align 4
-L$12rounds_alt:
- movdqa L$key_rotate192(%rip),%xmm5
- movdqa L$key_rcon1(%rip),%xmm4
- movl $8,%r10d
- movdqu %xmm0,(%rdx)
- jmp L$oop_key192
-
-.p2align 4
-L$oop_key192:
- movq %xmm2,0(%rax)
- movdqa %xmm2,%xmm1
-.byte 102,15,56,0,213
-.byte 102,15,56,221,212
- pslld $1,%xmm4
- leaq 24(%rax),%rax
-
- movdqa %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm3,%xmm0
-
- pshufd $0xff,%xmm0,%xmm3
- pxor %xmm1,%xmm3
- pslldq $4,%xmm1
- pxor %xmm1,%xmm3
-
- pxor %xmm2,%xmm0
- pxor %xmm3,%xmm2
- movdqu %xmm0,-16(%rax)
-
- decl %r10d
- jnz L$oop_key192
-
- movl %esi,32(%rax)
- xorl %eax,%eax
- jmp L$enc_key_ret
-
-.p2align 4
L$14rounds:
movups 16(%rdi),%xmm2
movl $13,%esi
leaq 16(%rax),%rax
- cmpl $268435456,%r10d
- je L$14rounds_alt
movups %xmm0,(%rdx)
movups %xmm2,16(%rdx)
@@ -2183,60 +2065,6 @@
jmp L$enc_key_ret
.p2align 4
-L$14rounds_alt:
- movdqa L$key_rotate(%rip),%xmm5
- movdqa L$key_rcon1(%rip),%xmm4
- movl $7,%r10d
- movdqu %xmm0,0(%rdx)
- movdqa %xmm2,%xmm1
- movdqu %xmm2,16(%rdx)
- jmp L$oop_key256
-
-.p2align 4
-L$oop_key256:
-.byte 102,15,56,0,213
-.byte 102,15,56,221,212
-
- movdqa %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm3,%xmm0
- pslld $1,%xmm4
-
- pxor %xmm2,%xmm0
- movdqu %xmm0,(%rax)
-
- decl %r10d
- jz L$done_key256
-
- pshufd $0xff,%xmm0,%xmm2
- pxor %xmm3,%xmm3
-.byte 102,15,56,221,211
-
- movdqa %xmm1,%xmm3
- pslldq $4,%xmm1
- pxor %xmm1,%xmm3
- pslldq $4,%xmm1
- pxor %xmm1,%xmm3
- pslldq $4,%xmm1
- pxor %xmm3,%xmm1
-
- pxor %xmm1,%xmm2
- movdqu %xmm2,16(%rax)
- leaq 32(%rax),%rax
- movdqa %xmm2,%xmm1
-
- jmp L$oop_key256
-
-L$done_key256:
- movl %esi,16(%rax)
- xorl %eax,%eax
- jmp L$enc_key_ret
-
-.p2align 4
L$bad_keybits:
movq $-2,%rax
L$enc_key_ret:
@@ -2321,6 +2149,214 @@
xorps %xmm1,%xmm2
ret
+
+.globl _aes_hw_set_encrypt_key_alt
+.private_extern _aes_hw_set_encrypt_key_alt
+
+.p2align 4
+_aes_hw_set_encrypt_key_alt:
+
+
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+ movb $1,_BORINGSSL_function_hit+3(%rip)
+#endif
+ subq $8,%rsp
+
+
+
+ movups (%rdi),%xmm0
+ xorps %xmm4,%xmm4
+ leaq 16(%rdx),%rax
+ cmpl $256,%esi
+ je L$14rounds_alt
+ cmpl $192,%esi
+ je L$12rounds_alt
+ cmpl $128,%esi
+ jne L$bad_keybits_alt
+
+ movl $9,%esi
+ movdqa L$key_rotate(%rip),%xmm5
+ movl $8,%r10d
+ movdqa L$key_rcon1(%rip),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,(%rdx)
+ jmp L$oop_key128
+
+.p2align 4
+L$oop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leaq 16(%rax),%rax
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%rax)
+ movdqa %xmm0,%xmm2
+
+ decl %r10d
+ jnz L$oop_key128
+
+ movdqa L$key_rcon1b(%rip),%xmm4
+
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%rax)
+
+ movl %esi,96(%rax)
+ xorl %eax,%eax
+ jmp L$enc_key_ret_alt
+
+.p2align 4
+L$12rounds_alt:
+ movq 16(%rdi),%xmm2
+ movl $11,%esi
+ movdqa L$key_rotate192(%rip),%xmm5
+ movdqa L$key_rcon1(%rip),%xmm4
+ movl $8,%r10d
+ movdqu %xmm0,(%rdx)
+ jmp L$oop_key192
+
+.p2align 4
+L$oop_key192:
+ movq %xmm2,0(%rax)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leaq 24(%rax),%rax
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+
+ pshufd $0xff,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%rax)
+
+ decl %r10d
+ jnz L$oop_key192
+
+ movl %esi,32(%rax)
+ xorl %eax,%eax
+ jmp L$enc_key_ret_alt
+
+.p2align 4
+L$14rounds_alt:
+ movups 16(%rdi),%xmm2
+ movl $13,%esi
+ leaq 16(%rax),%rax
+ movdqa L$key_rotate(%rip),%xmm5
+ movdqa L$key_rcon1(%rip),%xmm4
+ movl $7,%r10d
+ movdqu %xmm0,0(%rdx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,16(%rdx)
+ jmp L$oop_key256
+
+.p2align 4
+L$oop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ decl %r10d
+ jz L$done_key256
+
+ pshufd $0xff,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%rax)
+ leaq 32(%rax),%rax
+ movdqa %xmm2,%xmm1
+
+ jmp L$oop_key256
+
+L$done_key256:
+ movl %esi,16(%rax)
+ xorl %eax,%eax
+ jmp L$enc_key_ret_alt
+
+.p2align 4
+L$bad_keybits_alt:
+ movq $-2,%rax
+L$enc_key_ret_alt:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ addq $8,%rsp
+
+ ret
+
+
+
.section __DATA,__const
.p2align 6
L$bswap_mask:
diff --git a/gen/bcm/aesni-x86_64-linux.S b/gen/bcm/aesni-x86_64-linux.S
index 38ed6e7..cd695b4 100644
--- a/gen/bcm/aesni-x86_64-linux.S
+++ b/gen/bcm/aesni-x86_64-linux.S
@@ -5,8 +5,6 @@
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
.text
-.extern OPENSSL_ia32cap_P
-.hidden OPENSSL_ia32cap_P
.globl aes_hw_encrypt
.hidden aes_hw_encrypt
.type aes_hw_encrypt,@function
@@ -1947,11 +1945,11 @@
ret
.cfi_endproc
.size aes_hw_encrypt_key_to_decrypt_key,.-aes_hw_encrypt_key_to_decrypt_key
-.globl aes_hw_set_encrypt_key
-.hidden aes_hw_set_encrypt_key
-.type aes_hw_set_encrypt_key,@function
+.globl aes_hw_set_encrypt_key_base
+.hidden aes_hw_set_encrypt_key_base
+.type aes_hw_set_encrypt_key_base,@function
.align 16
-aes_hw_set_encrypt_key:
+aes_hw_set_encrypt_key_base:
.cfi_startproc
_CET_ENDBR
@@ -1964,9 +1962,6 @@
movups (%rdi),%xmm0
xorps %xmm4,%xmm4
- leaq OPENSSL_ia32cap_P(%rip),%r10
- movl 4(%r10),%r10d
- andl $268437504,%r10d
leaq 16(%rdx),%rax
cmpl $256,%esi
je .L14rounds
@@ -1977,8 +1972,6 @@
.L10rounds:
movl $9,%esi
- cmpl $268435456,%r10d
- je .L10rounds_alt
movups %xmm0,(%rdx)
.byte 102,15,58,223,200,1
@@ -2007,78 +2000,9 @@
jmp .Lenc_key_ret
.align 16
-.L10rounds_alt:
- movdqa .Lkey_rotate(%rip),%xmm5
- movl $8,%r10d
- movdqa .Lkey_rcon1(%rip),%xmm4
- movdqa %xmm0,%xmm2
- movdqu %xmm0,(%rdx)
- jmp .Loop_key128
-
-.align 16
-.Loop_key128:
-.byte 102,15,56,0,197
-.byte 102,15,56,221,196
- pslld $1,%xmm4
- leaq 16(%rax),%rax
-
- movdqa %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm3,%xmm2
-
- pxor %xmm2,%xmm0
- movdqu %xmm0,-16(%rax)
- movdqa %xmm0,%xmm2
-
- decl %r10d
- jnz .Loop_key128
-
- movdqa .Lkey_rcon1b(%rip),%xmm4
-
-.byte 102,15,56,0,197
-.byte 102,15,56,221,196
- pslld $1,%xmm4
-
- movdqa %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm3,%xmm2
-
- pxor %xmm2,%xmm0
- movdqu %xmm0,(%rax)
-
- movdqa %xmm0,%xmm2
-.byte 102,15,56,0,197
-.byte 102,15,56,221,196
-
- movdqa %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm3,%xmm2
-
- pxor %xmm2,%xmm0
- movdqu %xmm0,16(%rax)
-
- movl %esi,96(%rax)
- xorl %eax,%eax
- jmp .Lenc_key_ret
-
-.align 16
.L12rounds:
movq 16(%rdi),%xmm2
movl $11,%esi
- cmpl $268435456,%r10d
- je .L12rounds_alt
movups %xmm0,(%rdx)
.byte 102,15,58,223,202,1
@@ -2103,53 +2027,10 @@
jmp .Lenc_key_ret
.align 16
-.L12rounds_alt:
- movdqa .Lkey_rotate192(%rip),%xmm5
- movdqa .Lkey_rcon1(%rip),%xmm4
- movl $8,%r10d
- movdqu %xmm0,(%rdx)
- jmp .Loop_key192
-
-.align 16
-.Loop_key192:
- movq %xmm2,0(%rax)
- movdqa %xmm2,%xmm1
-.byte 102,15,56,0,213
-.byte 102,15,56,221,212
- pslld $1,%xmm4
- leaq 24(%rax),%rax
-
- movdqa %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm3,%xmm0
-
- pshufd $0xff,%xmm0,%xmm3
- pxor %xmm1,%xmm3
- pslldq $4,%xmm1
- pxor %xmm1,%xmm3
-
- pxor %xmm2,%xmm0
- pxor %xmm3,%xmm2
- movdqu %xmm0,-16(%rax)
-
- decl %r10d
- jnz .Loop_key192
-
- movl %esi,32(%rax)
- xorl %eax,%eax
- jmp .Lenc_key_ret
-
-.align 16
.L14rounds:
movups 16(%rdi),%xmm2
movl $13,%esi
leaq 16(%rax),%rax
- cmpl $268435456,%r10d
- je .L14rounds_alt
movups %xmm0,(%rdx)
movups %xmm2,16(%rdx)
@@ -2185,60 +2066,6 @@
jmp .Lenc_key_ret
.align 16
-.L14rounds_alt:
- movdqa .Lkey_rotate(%rip),%xmm5
- movdqa .Lkey_rcon1(%rip),%xmm4
- movl $7,%r10d
- movdqu %xmm0,0(%rdx)
- movdqa %xmm2,%xmm1
- movdqu %xmm2,16(%rdx)
- jmp .Loop_key256
-
-.align 16
-.Loop_key256:
-.byte 102,15,56,0,213
-.byte 102,15,56,221,212
-
- movdqa %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm3,%xmm0
- pslld $1,%xmm4
-
- pxor %xmm2,%xmm0
- movdqu %xmm0,(%rax)
-
- decl %r10d
- jz .Ldone_key256
-
- pshufd $0xff,%xmm0,%xmm2
- pxor %xmm3,%xmm3
-.byte 102,15,56,221,211
-
- movdqa %xmm1,%xmm3
- pslldq $4,%xmm1
- pxor %xmm1,%xmm3
- pslldq $4,%xmm1
- pxor %xmm1,%xmm3
- pslldq $4,%xmm1
- pxor %xmm3,%xmm1
-
- pxor %xmm1,%xmm2
- movdqu %xmm2,16(%rax)
- leaq 32(%rax),%rax
- movdqa %xmm2,%xmm1
-
- jmp .Loop_key256
-
-.Ldone_key256:
- movl %esi,16(%rax)
- xorl %eax,%eax
- jmp .Lenc_key_ret
-
-.align 16
.Lbad_keybits:
movq $-2,%rax
.Lenc_key_ret:
@@ -2322,7 +2149,215 @@
shufps $170,%xmm1,%xmm1
xorps %xmm1,%xmm2
ret
-.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
+.size aes_hw_set_encrypt_key_base,.-aes_hw_set_encrypt_key_base
+
+.globl aes_hw_set_encrypt_key_alt
+.hidden aes_hw_set_encrypt_key_alt
+.type aes_hw_set_encrypt_key_alt,@function
+.align 16
+aes_hw_set_encrypt_key_alt:
+.cfi_startproc
+
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+ movb $1,BORINGSSL_function_hit+3(%rip)
+#endif
+ subq $8,%rsp
+.cfi_adjust_cfa_offset 8
+
+
+ movups (%rdi),%xmm0
+ xorps %xmm4,%xmm4
+ leaq 16(%rdx),%rax
+ cmpl $256,%esi
+ je .L14rounds_alt
+ cmpl $192,%esi
+ je .L12rounds_alt
+ cmpl $128,%esi
+ jne .Lbad_keybits_alt
+
+ movl $9,%esi
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movl $8,%r10d
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,(%rdx)
+ jmp .Loop_key128
+
+.align 16
+.Loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leaq 16(%rax),%rax
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%rax)
+ movdqa %xmm0,%xmm2
+
+ decl %r10d
+ jnz .Loop_key128
+
+ movdqa .Lkey_rcon1b(%rip),%xmm4
+
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%rax)
+
+ movl %esi,96(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret_alt
+
+.align 16
+.L12rounds_alt:
+ movq 16(%rdi),%xmm2
+ movl $11,%esi
+ movdqa .Lkey_rotate192(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movl $8,%r10d
+ movdqu %xmm0,(%rdx)
+ jmp .Loop_key192
+
+.align 16
+.Loop_key192:
+ movq %xmm2,0(%rax)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leaq 24(%rax),%rax
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+
+ pshufd $0xff,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%rax)
+
+ decl %r10d
+ jnz .Loop_key192
+
+ movl %esi,32(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret_alt
+
+.align 16
+.L14rounds_alt:
+ movups 16(%rdi),%xmm2
+ movl $13,%esi
+ leaq 16(%rax),%rax
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movl $7,%r10d
+ movdqu %xmm0,0(%rdx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,16(%rdx)
+ jmp .Loop_key256
+
+.align 16
+.Loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ decl %r10d
+ jz .Ldone_key256
+
+ pshufd $0xff,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%rax)
+ leaq 32(%rax),%rax
+ movdqa %xmm2,%xmm1
+
+ jmp .Loop_key256
+
+.Ldone_key256:
+ movl %esi,16(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret_alt
+
+.align 16
+.Lbad_keybits_alt:
+ movq $-2,%rax
+.Lenc_key_ret_alt:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ addq $8,%rsp
+.cfi_adjust_cfa_offset -8
+ ret
+.cfi_endproc
+
+.size aes_hw_set_encrypt_key_alt,.-aes_hw_set_encrypt_key_alt
.section .rodata
.align 64
.Lbswap_mask:
diff --git a/gen/bcm/aesni-x86_64-win.asm b/gen/bcm/aesni-x86_64-win.asm
index 64dd6be..4876ed7 100644
--- a/gen/bcm/aesni-x86_64-win.asm
+++ b/gen/bcm/aesni-x86_64-win.asm
@@ -13,7 +13,6 @@
%endif
section .text code align=64
-EXTERN OPENSSL_ia32cap_P
global aes_hw_encrypt
ALIGN 16
@@ -2051,25 +2050,22 @@
ret
-global aes_hw_set_encrypt_key
+global aes_hw_set_encrypt_key_base
ALIGN 16
-aes_hw_set_encrypt_key:
+aes_hw_set_encrypt_key_base:
-$L$SEH_begin_aes_hw_set_encrypt_key_1:
+$L$SEH_begin_aes_hw_set_encrypt_key_base_1:
_CET_ENDBR
%ifdef BORINGSSL_DISPATCH_TEST
mov BYTE[((BORINGSSL_function_hit+3))],1
%endif
sub rsp,8
-$L$SEH_prologue_aes_hw_set_encrypt_key_2:
-$L$SEH_endprologue_aes_hw_set_encrypt_key_3:
+$L$SEH_prologue_aes_hw_set_encrypt_key_base_2:
+$L$SEH_endprologue_aes_hw_set_encrypt_key_base_3:
movups xmm0,XMMWORD[rcx]
xorps xmm4,xmm4
- lea r10,[OPENSSL_ia32cap_P]
- mov r10d,DWORD[4+r10]
- and r10d,268437504
lea rax,[16+r8]
cmp edx,256
je NEAR $L$14rounds
@@ -2080,8 +2076,6 @@
$L$10rounds:
mov edx,9
- cmp r10d,268435456
- je NEAR $L$10rounds_alt
movups XMMWORD[r8],xmm0
DB 102,15,58,223,200,1
@@ -2110,78 +2104,9 @@
jmp NEAR $L$enc_key_ret
ALIGN 16
-$L$10rounds_alt:
- movdqa xmm5,XMMWORD[$L$key_rotate]
- mov r10d,8
- movdqa xmm4,XMMWORD[$L$key_rcon1]
- movdqa xmm2,xmm0
- movdqu XMMWORD[r8],xmm0
- jmp NEAR $L$oop_key128
-
-ALIGN 16
-$L$oop_key128:
-DB 102,15,56,0,197
- DB 102,15,56,221,196
- pslld xmm4,1
- lea rax,[16+rax]
-
- movdqa xmm3,xmm2
- pslldq xmm2,4
- pxor xmm3,xmm2
- pslldq xmm2,4
- pxor xmm3,xmm2
- pslldq xmm2,4
- pxor xmm2,xmm3
-
- pxor xmm0,xmm2
- movdqu XMMWORD[(-16)+rax],xmm0
- movdqa xmm2,xmm0
-
- dec r10d
- jnz NEAR $L$oop_key128
-
- movdqa xmm4,XMMWORD[$L$key_rcon1b]
-
-DB 102,15,56,0,197
- DB 102,15,56,221,196
- pslld xmm4,1
-
- movdqa xmm3,xmm2
- pslldq xmm2,4
- pxor xmm3,xmm2
- pslldq xmm2,4
- pxor xmm3,xmm2
- pslldq xmm2,4
- pxor xmm2,xmm3
-
- pxor xmm0,xmm2
- movdqu XMMWORD[rax],xmm0
-
- movdqa xmm2,xmm0
-DB 102,15,56,0,197
- DB 102,15,56,221,196
-
- movdqa xmm3,xmm2
- pslldq xmm2,4
- pxor xmm3,xmm2
- pslldq xmm2,4
- pxor xmm3,xmm2
- pslldq xmm2,4
- pxor xmm2,xmm3
-
- pxor xmm0,xmm2
- movdqu XMMWORD[16+rax],xmm0
-
- mov DWORD[96+rax],edx
- xor eax,eax
- jmp NEAR $L$enc_key_ret
-
-ALIGN 16
$L$12rounds:
movq xmm2,QWORD[16+rcx]
mov edx,11
- cmp r10d,268435456
- je NEAR $L$12rounds_alt
movups XMMWORD[r8],xmm0
DB 102,15,58,223,202,1
@@ -2206,53 +2131,10 @@
jmp NEAR $L$enc_key_ret
ALIGN 16
-$L$12rounds_alt:
- movdqa xmm5,XMMWORD[$L$key_rotate192]
- movdqa xmm4,XMMWORD[$L$key_rcon1]
- mov r10d,8
- movdqu XMMWORD[r8],xmm0
- jmp NEAR $L$oop_key192
-
-ALIGN 16
-$L$oop_key192:
- movq QWORD[rax],xmm2
- movdqa xmm1,xmm2
-DB 102,15,56,0,213
- DB 102,15,56,221,212
- pslld xmm4,1
- lea rax,[24+rax]
-
- movdqa xmm3,xmm0
- pslldq xmm0,4
- pxor xmm3,xmm0
- pslldq xmm0,4
- pxor xmm3,xmm0
- pslldq xmm0,4
- pxor xmm0,xmm3
-
- pshufd xmm3,xmm0,0xff
- pxor xmm3,xmm1
- pslldq xmm1,4
- pxor xmm3,xmm1
-
- pxor xmm0,xmm2
- pxor xmm2,xmm3
- movdqu XMMWORD[(-16)+rax],xmm0
-
- dec r10d
- jnz NEAR $L$oop_key192
-
- mov DWORD[32+rax],edx
- xor eax,eax
- jmp NEAR $L$enc_key_ret
-
-ALIGN 16
$L$14rounds:
movups xmm2,XMMWORD[16+rcx]
mov edx,13
lea rax,[16+rax]
- cmp r10d,268435456
- je NEAR $L$14rounds_alt
movups XMMWORD[r8],xmm0
movups XMMWORD[16+r8],xmm2
@@ -2288,60 +2170,6 @@
jmp NEAR $L$enc_key_ret
ALIGN 16
-$L$14rounds_alt:
- movdqa xmm5,XMMWORD[$L$key_rotate]
- movdqa xmm4,XMMWORD[$L$key_rcon1]
- mov r10d,7
- movdqu XMMWORD[r8],xmm0
- movdqa xmm1,xmm2
- movdqu XMMWORD[16+r8],xmm2
- jmp NEAR $L$oop_key256
-
-ALIGN 16
-$L$oop_key256:
-DB 102,15,56,0,213
- DB 102,15,56,221,212
-
- movdqa xmm3,xmm0
- pslldq xmm0,4
- pxor xmm3,xmm0
- pslldq xmm0,4
- pxor xmm3,xmm0
- pslldq xmm0,4
- pxor xmm0,xmm3
- pslld xmm4,1
-
- pxor xmm0,xmm2
- movdqu XMMWORD[rax],xmm0
-
- dec r10d
- jz NEAR $L$done_key256
-
- pshufd xmm2,xmm0,0xff
- pxor xmm3,xmm3
- DB 102,15,56,221,211
-
- movdqa xmm3,xmm1
- pslldq xmm1,4
- pxor xmm3,xmm1
- pslldq xmm1,4
- pxor xmm3,xmm1
- pslldq xmm1,4
- pxor xmm1,xmm3
-
- pxor xmm2,xmm1
- movdqu XMMWORD[16+rax],xmm2
- lea rax,[32+rax]
- movdqa xmm1,xmm2
-
- jmp NEAR $L$oop_key256
-
-$L$done_key256:
- mov DWORD[16+rax],edx
- xor eax,eax
- jmp NEAR $L$enc_key_ret
-
-ALIGN 16
$L$bad_keybits:
mov rax,-2
$L$enc_key_ret:
@@ -2355,7 +2183,7 @@
ret
-$L$SEH_end_aes_hw_set_encrypt_key_4:
+$L$SEH_end_aes_hw_set_encrypt_key_base_4:
ALIGN 16
$L$key_expansion_128:
@@ -2426,6 +2254,213 @@
xorps xmm2,xmm1
ret
+
+global aes_hw_set_encrypt_key_alt
+
+ALIGN 16
+aes_hw_set_encrypt_key_alt:
+
+$L$SEH_begin_aes_hw_set_encrypt_key_alt_1:
+_CET_ENDBR
+%ifdef BORINGSSL_DISPATCH_TEST
+ mov BYTE[((BORINGSSL_function_hit+3))],1
+%endif
+ sub rsp,8
+
+$L$SEH_prologue_aes_hw_set_encrypt_key_alt_2:
+$L$SEH_endprologue_aes_hw_set_encrypt_key_alt_3:
+ movups xmm0,XMMWORD[rcx]
+ xorps xmm4,xmm4
+ lea rax,[16+r8]
+ cmp edx,256
+ je NEAR $L$14rounds_alt
+ cmp edx,192
+ je NEAR $L$12rounds_alt
+ cmp edx,128
+ jne NEAR $L$bad_keybits_alt
+
+ mov edx,9
+ movdqa xmm5,XMMWORD[$L$key_rotate]
+ mov r10d,8
+ movdqa xmm4,XMMWORD[$L$key_rcon1]
+ movdqa xmm2,xmm0
+ movdqu XMMWORD[r8],xmm0
+ jmp NEAR $L$oop_key128
+
+ALIGN 16
+$L$oop_key128:
+DB 102,15,56,0,197
+ DB 102,15,56,221,196
+ pslld xmm4,1
+ lea rax,[16+rax]
+
+ movdqa xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm2,xmm3
+
+ pxor xmm0,xmm2
+ movdqu XMMWORD[(-16)+rax],xmm0
+ movdqa xmm2,xmm0
+
+ dec r10d
+ jnz NEAR $L$oop_key128
+
+ movdqa xmm4,XMMWORD[$L$key_rcon1b]
+
+DB 102,15,56,0,197
+ DB 102,15,56,221,196
+ pslld xmm4,1
+
+ movdqa xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm2,xmm3
+
+ pxor xmm0,xmm2
+ movdqu XMMWORD[rax],xmm0
+
+ movdqa xmm2,xmm0
+DB 102,15,56,0,197
+ DB 102,15,56,221,196
+
+ movdqa xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm2,xmm3
+
+ pxor xmm0,xmm2
+ movdqu XMMWORD[16+rax],xmm0
+
+ mov DWORD[96+rax],edx
+ xor eax,eax
+ jmp NEAR $L$enc_key_ret_alt
+
+ALIGN 16
+$L$12rounds_alt:
+ movq xmm2,QWORD[16+rcx]
+ mov edx,11
+ movdqa xmm5,XMMWORD[$L$key_rotate192]
+ movdqa xmm4,XMMWORD[$L$key_rcon1]
+ mov r10d,8
+ movdqu XMMWORD[r8],xmm0
+ jmp NEAR $L$oop_key192
+
+ALIGN 16
+$L$oop_key192:
+ movq QWORD[rax],xmm2
+ movdqa xmm1,xmm2
+DB 102,15,56,0,213
+ DB 102,15,56,221,212
+ pslld xmm4,1
+ lea rax,[24+rax]
+
+ movdqa xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm0,xmm3
+
+ pshufd xmm3,xmm0,0xff
+ pxor xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm3,xmm1
+
+ pxor xmm0,xmm2
+ pxor xmm2,xmm3
+ movdqu XMMWORD[(-16)+rax],xmm0
+
+ dec r10d
+ jnz NEAR $L$oop_key192
+
+ mov DWORD[32+rax],edx
+ xor eax,eax
+ jmp NEAR $L$enc_key_ret_alt
+
+ALIGN 16
+$L$14rounds_alt:
+ movups xmm2,XMMWORD[16+rcx]
+ mov edx,13
+ lea rax,[16+rax]
+ movdqa xmm5,XMMWORD[$L$key_rotate]
+ movdqa xmm4,XMMWORD[$L$key_rcon1]
+ mov r10d,7
+ movdqu XMMWORD[r8],xmm0
+ movdqa xmm1,xmm2
+ movdqu XMMWORD[16+r8],xmm2
+ jmp NEAR $L$oop_key256
+
+ALIGN 16
+$L$oop_key256:
+DB 102,15,56,0,213
+ DB 102,15,56,221,212
+
+ movdqa xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm0,xmm3
+ pslld xmm4,1
+
+ pxor xmm0,xmm2
+ movdqu XMMWORD[rax],xmm0
+
+ dec r10d
+ jz NEAR $L$done_key256
+
+ pshufd xmm2,xmm0,0xff
+ pxor xmm3,xmm3
+ DB 102,15,56,221,211
+
+ movdqa xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm1,xmm3
+
+ pxor xmm2,xmm1
+ movdqu XMMWORD[16+rax],xmm2
+ lea rax,[32+rax]
+ movdqa xmm1,xmm2
+
+ jmp NEAR $L$oop_key256
+
+$L$done_key256:
+ mov DWORD[16+rax],edx
+ xor eax,eax
+ jmp NEAR $L$enc_key_ret_alt
+
+ALIGN 16
+$L$bad_keybits_alt:
+ mov rax,-2
+$L$enc_key_ret_alt:
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ add rsp,8
+
+ ret
+
+$L$SEH_end_aes_hw_set_encrypt_key_alt_4:
+
section .rdata rdata align=8
ALIGN 64
$L$bswap_mask:
@@ -2650,19 +2685,32 @@
DD cbc_se_handler wrt ..imagebase
section .pdata
ALIGN 4
- DD $L$SEH_begin_aes_hw_set_encrypt_key_1 wrt ..imagebase
- DD $L$SEH_end_aes_hw_set_encrypt_key_4 wrt ..imagebase
- DD $L$SEH_info_aes_hw_set_encrypt_key_0 wrt ..imagebase
+ DD $L$SEH_begin_aes_hw_set_encrypt_key_base_1 wrt ..imagebase
+ DD $L$SEH_end_aes_hw_set_encrypt_key_base_4 wrt ..imagebase
+ DD $L$SEH_info_aes_hw_set_encrypt_key_base_0 wrt ..imagebase
+
+ DD $L$SEH_begin_aes_hw_set_encrypt_key_alt_1 wrt ..imagebase
+ DD $L$SEH_end_aes_hw_set_encrypt_key_alt_4 wrt ..imagebase
+ DD $L$SEH_info_aes_hw_set_encrypt_key_alt_0 wrt ..imagebase
section .xdata
ALIGN 4
-$L$SEH_info_aes_hw_set_encrypt_key_0:
+$L$SEH_info_aes_hw_set_encrypt_key_base_0:
DB 1
- DB $L$SEH_endprologue_aes_hw_set_encrypt_key_3-$L$SEH_begin_aes_hw_set_encrypt_key_1
+ DB $L$SEH_endprologue_aes_hw_set_encrypt_key_base_3-$L$SEH_begin_aes_hw_set_encrypt_key_base_1
DB 1
DB 0
- DB $L$SEH_prologue_aes_hw_set_encrypt_key_2-$L$SEH_begin_aes_hw_set_encrypt_key_1
+ DB $L$SEH_prologue_aes_hw_set_encrypt_key_base_2-$L$SEH_begin_aes_hw_set_encrypt_key_base_1
+ DB 2
+
+ DW 0
+$L$SEH_info_aes_hw_set_encrypt_key_alt_0:
+ DB 1
+ DB $L$SEH_endprologue_aes_hw_set_encrypt_key_alt_3-$L$SEH_begin_aes_hw_set_encrypt_key_alt_1
+ DB 1
+ DB 0
+ DB $L$SEH_prologue_aes_hw_set_encrypt_key_alt_2-$L$SEH_begin_aes_hw_set_encrypt_key_alt_1
DB 2
DW 0
diff --git a/gen/bcm/vpaes-x86-apple.S b/gen/bcm/vpaes-x86-apple.S
index 4d2c485..02d3787 100644
--- a/gen/bcm/vpaes-x86-apple.S
+++ b/gen/bcm/vpaes-x86-apple.S
@@ -470,10 +470,10 @@
#ifdef BORINGSSL_DISPATCH_TEST
pushl %ebx
pushl %edx
- call L016pic
-L016pic:
+ call L016pic_for_function_hit
+L016pic_for_function_hit:
popl %ebx
- leal _BORINGSSL_function_hit+5-L016pic(%ebx),%ebx
+ leal _BORINGSSL_function_hit+5-L016pic_for_function_hit(%ebx),%ebx
movl $1,%edx
movb %dl,(%ebx)
popl %edx
@@ -551,10 +551,10 @@
#ifdef BORINGSSL_DISPATCH_TEST
pushl %ebx
pushl %edx
- call L019pic
-L019pic:
+ call L019pic_for_function_hit
+L019pic_for_function_hit:
popl %ebx
- leal _BORINGSSL_function_hit+4-L019pic(%ebx),%ebx
+ leal _BORINGSSL_function_hit+4-L019pic_for_function_hit(%ebx),%ebx
movl $1,%edx
movb %dl,(%ebx)
popl %edx
diff --git a/gen/bcm/vpaes-x86-linux.S b/gen/bcm/vpaes-x86-linux.S
index 02786a7..31dc9a0 100644
--- a/gen/bcm/vpaes-x86-linux.S
+++ b/gen/bcm/vpaes-x86-linux.S
@@ -487,10 +487,10 @@
#ifdef BORINGSSL_DISPATCH_TEST
pushl %ebx
pushl %edx
- call .L016pic
-.L016pic:
+ call .L016pic_for_function_hit
+.L016pic_for_function_hit:
popl %ebx
- leal BORINGSSL_function_hit+5-.L016pic(%ebx),%ebx
+ leal BORINGSSL_function_hit+5-.L016pic_for_function_hit(%ebx),%ebx
movl $1,%edx
movb %dl,(%ebx)
popl %edx
@@ -572,10 +572,10 @@
#ifdef BORINGSSL_DISPATCH_TEST
pushl %ebx
pushl %edx
- call .L019pic
-.L019pic:
+ call .L019pic_for_function_hit
+.L019pic_for_function_hit:
popl %ebx
- leal BORINGSSL_function_hit+4-.L019pic(%ebx),%ebx
+ leal BORINGSSL_function_hit+4-.L019pic_for_function_hit(%ebx),%ebx
movl $1,%edx
movb %dl,(%ebx)
popl %edx
diff --git a/gen/bcm/vpaes-x86-win.asm b/gen/bcm/vpaes-x86-win.asm
index 661496e..3f087e1 100644
--- a/gen/bcm/vpaes-x86-win.asm
+++ b/gen/bcm/vpaes-x86-win.asm
@@ -470,10 +470,10 @@
%ifdef BORINGSSL_DISPATCH_TEST
push ebx
push edx
- call L$016pic
-L$016pic:
+ call L$016pic_for_function_hit
+L$016pic_for_function_hit:
pop ebx
- lea ebx,[(_BORINGSSL_function_hit+5-L$016pic)+ebx]
+ lea ebx,[(_BORINGSSL_function_hit+5-L$016pic_for_function_hit)+ebx]
mov edx,1
mov BYTE [ebx],dl
pop edx
@@ -549,10 +549,10 @@
%ifdef BORINGSSL_DISPATCH_TEST
push ebx
push edx
- call L$019pic
-L$019pic:
+ call L$019pic_for_function_hit
+L$019pic_for_function_hit:
pop ebx
- lea ebx,[(_BORINGSSL_function_hit+4-L$019pic)+ebx]
+ lea ebx,[(_BORINGSSL_function_hit+4-L$019pic_for_function_hit)+ebx]
mov edx,1
mov BYTE [ebx],dl
pop edx