Disable the "AVX10/256" AES-GCM functions for now
Since there is now a VAES+AVX2 implementation of AES-GCM, and the future
of AVX10/256 is uncertain, disable the AES-GCM functions that use
AVX10/256 (equivalently AVX512 with a maximum vector length of 256
bits). This leaves VAES+AVX2 as the sole 256-bit support for now.
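Concretely, the combined AES-GCM implementation selection now reduces to the
shape sketched below. This is a simplified, illustrative model of the logic in
crypto/fipsmodule/modes/gcm.cc.inc, not the literal code; the real code uses
the BoringSSL-internal CRYPTO_is_*_capable() and
CRYPTO_cpu_avoid_zmm_registers() helpers that appear in the diff, which are
modeled here as plain booleans:

    // Simplified model of the post-change implementation selection.
    // Illustrative only; feature bits are passed in as booleans so the
    // ordering is easy to see.
    enum class GcmImpl { Fallback, VaesAvx2, VaesAvx10_512 };

    static GcmImpl ChooseGcmImpl(bool vpclmulqdq, bool avx2, bool avx512bw,
                                 bool avx512vl, bool bmi2, bool vaes,
                                 bool avoid_zmm) {
      if (vpclmulqdq && avx2 && vaes) {
        // 512-bit path: needs the AVX512 subset and a CPU where using ZMM
        // registers is not penalized (i.e. not Ice Lake / Tiger Lake).
        if (avx512bw && avx512vl && bmi2 && !avoid_zmm) {
          return GcmImpl::VaesAvx10_512;
        }
        // The VAES+AVX2 code is now the only 256-bit option.
        return GcmImpl::VaesAvx2;
      }
      return GcmImpl::Fallback;  // stand-in for the older AES-NI/CLMUL paths
    }

    // Ice Lake / Tiger Lake report full AVX512 but set avoid_zmm, so with
    // this change they select VaesAvx2 instead of the former avx10_256 code:
    //   ChooseGcmImpl(true, true, true, true, true, true, /*avoid_zmm=*/true)
    //     == GcmImpl::VaesAvx2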
For now this just affects Intel Ice Lake and Tiger Lake, which do support
AVX512 but where downclocking issues make 256-bit vectors arguably preferable
to 512-bit. On those CPUs a slight performance loss is seen on long messages,
along with a slight gain on some short ones. The following tables compare
AES-256-GCM throughput in MB/s on an Ice Lake server for various message
lengths in bytes:
Encryption:
Length  | 16384 |  4096 |  4095 |  1420 |   512 |   500 |
--------+-------+-------+-------+-------+-------+-------+
Before  |  7533 |  6990 |  6220 |  5096 |  4200 |  2702 |
After   |  7403 |  6879 |  6236 |  4980 |  4040 |  2868 |

Length  |   300 |   200 |    64 |    63 |    16 |
--------+-------+-------+-------+-------+-------+
Before  |  2086 |  1555 |  1031 |   657 |   433 |
After   |  2069 |  1635 |  1045 |   667 |   430 |
Decryption:
Length  | 16384 |  4096 |  4095 |  1420 |   512 |   500 |
--------+-------+-------+-------+-------+-------+-------+
Before  |  7703 |  7140 |  6524 |  5283 |  4244 |  2990 |
After   |  7572 |  7056 |  6494 |  5155 |  4224 |  3073 |

Length  |   300 |   200 |    64 |    63 |    16 |
--------+-------+-------+-------+-------+-------+
Before  |  2276 |  1733 |  1070 |   680 |   447 |
After   |  2249 |  1743 |  1100 |   692 |   447 |
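The numbers above come from BoringSSL's own benchmarking. As a rough
illustration only, throughput for a single message length can be approximated
against the public AEAD API along these lines (a minimal sketch, not the
harness used for the tables above; absolute numbers will vary):

    // Minimal AES-256-GCM seal throughput sketch using BoringSSL's public
    // AEAD API. Illustrative only.
    #include <openssl/aead.h>

    #include <chrono>
    #include <cstdio>
    #include <vector>

    static double SealMBps(size_t msg_len, int iters) {
      const uint8_t key[32] = {0};    // fixed key/nonce are fine for timing
      const uint8_t nonce[12] = {0};
      std::vector<uint8_t> in(msg_len, 0xab);
      std::vector<uint8_t> out(msg_len + EVP_AEAD_MAX_OVERHEAD);

      EVP_AEAD_CTX ctx;
      if (!EVP_AEAD_CTX_init(&ctx, EVP_aead_aes_256_gcm(), key, sizeof(key),
                             EVP_AEAD_DEFAULT_TAG_LENGTH, nullptr)) {
        return 0.0;
      }
      auto start = std::chrono::steady_clock::now();
      for (int i = 0; i < iters; i++) {
        size_t out_len;
        EVP_AEAD_CTX_seal(&ctx, out.data(), &out_len, out.size(), nonce,
                          sizeof(nonce), in.data(), msg_len, nullptr, 0);
      }
      std::chrono::duration<double> secs =
          std::chrono::steady_clock::now() - start;
      EVP_AEAD_CTX_cleanup(&ctx);
      return (double(msg_len) * iters / secs.count()) / 1e6;  // MB/s
    }

    int main() {
      for (size_t len : {16384, 4096, 1420, 512, 300, 64, 16}) {
        printf("%5zu bytes: %6.0f MB/s\n", len, SealMBps(len, 200000));
      }
      return 0;
    }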
This change should be reconsidered if AVX10/256 ever sees widespread
deployment, since such CPUs shouldn't be left restricted to AVX2
unnecessarily.
This change also replaces gcm_init_vpclmulqdq_avx10 with
gcm_init_vpclmulqdq_avx10_512, which is now instantiated using 512-bit
vectors; otherwise it would have been the only remaining avx10 function still
using 256-bit vectors.
Change-Id: I7fd21568482118a2ce7a382e9042b187cd2739f7
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/74369
Reviewed-by: David Benjamin <davidben@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/modes/asm/aes-gcm-avx10-x86_64.pl b/crypto/fipsmodule/modes/asm/aes-gcm-avx10-x86_64.pl
index 269ac80..06ea7e6 100644
--- a/crypto/fipsmodule/modes/asm/aes-gcm-avx10-x86_64.pl
+++ b/crypto/fipsmodule/modes/asm/aes-gcm-avx10-x86_64.pl
@@ -1321,26 +1321,31 @@
}
$code .= _end_func;
-_set_veclen 32;
-
-$code .= _begin_func "gcm_init_vpclmulqdq_avx10", 0;
-$code .= _aes_gcm_init;
-$code .= _end_func;
-
-$code .= _begin_func "gcm_ghash_vpclmulqdq_avx10_256", 1;
-$code .= _ghash_update;
-$code .= _end_func;
-
-$code .= _begin_func "aes_gcm_enc_update_vaes_avx10_256", 1;
-$code .= _aes_gcm_update 1;
-$code .= _end_func;
-
-$code .= _begin_func "aes_gcm_dec_update_vaes_avx10_256", 1;
-$code .= _aes_gcm_update 0;
-$code .= _end_func;
+# Disabled until significant deployment of AVX10/256 is seen. The separate
+# *_vaes_avx2 implementation provides the only 256-bit support for now.
+#
+# $code .= _begin_func "gcm_init_vpclmulqdq_avx10_256", 0;
+# $code .= _aes_gcm_init;
+# $code .= _end_func;
+#
+# $code .= _begin_func "gcm_ghash_vpclmulqdq_avx10_256", 1;
+# $code .= _ghash_update;
+# $code .= _end_func;
+#
+# $code .= _begin_func "aes_gcm_enc_update_vaes_avx10_256", 1;
+# $code .= _aes_gcm_update 1;
+# $code .= _end_func;
+#
+# $code .= _begin_func "aes_gcm_dec_update_vaes_avx10_256", 1;
+# $code .= _aes_gcm_update 0;
+# $code .= _end_func;
_set_veclen 64;
+$code .= _begin_func "gcm_init_vpclmulqdq_avx10_512", 0;
+$code .= _aes_gcm_init;
+$code .= _end_func;
+
$code .= _begin_func "gcm_ghash_vpclmulqdq_avx10_512", 1;
$code .= _ghash_update;
$code .= _end_func;
diff --git a/crypto/fipsmodule/modes/gcm.cc.inc b/crypto/fipsmodule/modes/gcm.cc.inc
index e77c525..d8ccf00 100644
--- a/crypto/fipsmodule/modes/gcm.cc.inc
+++ b/crypto/fipsmodule/modes/gcm.cc.inc
@@ -104,11 +104,6 @@
aes_gcm_enc_update_vaes_avx2(in, out, len, key, ivec, Htable, Xi);
CRYPTO_store_u32_be(&ivec[12], CRYPTO_load_u32_be(&ivec[12]) + len / 16);
return len;
- case gcm_x86_vaes_avx10_256:
- len &= kSizeTWithoutLower4Bits;
- aes_gcm_enc_update_vaes_avx10_256(in, out, len, key, ivec, Htable, Xi);
- CRYPTO_store_u32_be(&ivec[12], CRYPTO_load_u32_be(&ivec[12]) + len / 16);
- return len;
case gcm_x86_vaes_avx10_512:
len &= kSizeTWithoutLower4Bits;
aes_gcm_enc_update_vaes_avx10_512(in, out, len, key, ivec, Htable, Xi);
@@ -129,11 +124,6 @@
aes_gcm_dec_update_vaes_avx2(in, out, len, key, ivec, Htable, Xi);
CRYPTO_store_u32_be(&ivec[12], CRYPTO_load_u32_be(&ivec[12]) + len / 16);
return len;
- case gcm_x86_vaes_avx10_256:
- len &= kSizeTWithoutLower4Bits;
- aes_gcm_dec_update_vaes_avx10_256(in, out, len, key, ivec, Htable, Xi);
- CRYPTO_store_u32_be(&ivec[12], CRYPTO_load_u32_be(&ivec[12]) + len / 16);
- return len;
case gcm_x86_vaes_avx10_512:
len &= kSizeTWithoutLower4Bits;
aes_gcm_dec_update_vaes_avx10_512(in, out, len, key, ivec, Htable, Xi);
@@ -183,14 +173,10 @@
if (crypto_gcm_clmul_enabled()) {
if (CRYPTO_is_VPCLMULQDQ_capable() && CRYPTO_is_AVX2_capable()) {
if (CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() &&
- CRYPTO_is_BMI2_capable()) {
- gcm_init_vpclmulqdq_avx10(out_table, H);
+ CRYPTO_is_BMI2_capable() && !CRYPTO_cpu_avoid_zmm_registers()) {
+ gcm_init_vpclmulqdq_avx10_512(out_table, H);
*out_mult = gcm_gmult_vpclmulqdq_avx10;
- if (CRYPTO_cpu_avoid_zmm_registers()) {
- *out_hash = gcm_ghash_vpclmulqdq_avx10_256;
- } else {
- *out_hash = gcm_ghash_vpclmulqdq_avx10_512;
- }
+ *out_hash = gcm_ghash_vpclmulqdq_avx10_512;
return;
}
gcm_init_vpclmulqdq_avx2(out_table, H);
@@ -275,11 +261,8 @@
#if !defined(OPENSSL_NO_ASM)
#if defined(OPENSSL_X86_64)
- if (gcm_key->ghash == gcm_ghash_vpclmulqdq_avx10_256 &&
+ if (gcm_key->ghash == gcm_ghash_vpclmulqdq_avx10_512 &&
CRYPTO_is_VAES_capable()) {
- gcm_key->impl = gcm_x86_vaes_avx10_256;
- } else if (gcm_key->ghash == gcm_ghash_vpclmulqdq_avx10_512 &&
- CRYPTO_is_VAES_capable()) {
gcm_key->impl = gcm_x86_vaes_avx10_512;
} else if (gcm_key->ghash == gcm_ghash_vpclmulqdq_avx2 &&
CRYPTO_is_VAES_capable()) {
diff --git a/crypto/fipsmodule/modes/gcm_test.cc b/crypto/fipsmodule/modes/gcm_test.cc
index d195526..6329675 100644
--- a/crypto/fipsmodule/modes/gcm_test.cc
+++ b/crypto/fipsmodule/modes/gcm_test.cc
@@ -111,21 +111,15 @@
static const uint8_t kKey[16] = {0};
uint8_t iv[16] = {0};
- CHECK_ABI_SEH(gcm_init_vpclmulqdq_avx10, Htable, kH);
+ CHECK_ABI_SEH(gcm_init_vpclmulqdq_avx10_512, Htable, kH);
CHECK_ABI_SEH(gcm_gmult_vpclmulqdq_avx10, X, Htable);
for (size_t blocks : kBlockCounts) {
- CHECK_ABI_SEH(gcm_ghash_vpclmulqdq_avx10_256, X, Htable, buf,
- 16 * blocks);
CHECK_ABI_SEH(gcm_ghash_vpclmulqdq_avx10_512, X, Htable, buf,
16 * blocks);
}
aes_hw_set_encrypt_key(kKey, 128, &aes_key);
for (size_t blocks : kBlockCounts) {
- CHECK_ABI_SEH(aes_gcm_enc_update_vaes_avx10_256, buf, buf, blocks * 16,
- &aes_key, iv, Htable, X);
- CHECK_ABI_SEH(aes_gcm_enc_update_vaes_avx10_256, buf, buf,
- blocks * 16 + 7, &aes_key, iv, Htable, X);
CHECK_ABI_SEH(aes_gcm_enc_update_vaes_avx10_512, buf, buf, blocks * 16,
&aes_key, iv, Htable, X);
CHECK_ABI_SEH(aes_gcm_enc_update_vaes_avx10_512, buf, buf,
@@ -133,10 +127,6 @@
}
aes_hw_set_decrypt_key(kKey, 128, &aes_key);
for (size_t blocks : kBlockCounts) {
- CHECK_ABI_SEH(aes_gcm_dec_update_vaes_avx10_256, buf, buf, blocks * 16,
- &aes_key, iv, Htable, X);
- CHECK_ABI_SEH(aes_gcm_dec_update_vaes_avx10_256, buf, buf,
- blocks * 16 + 7, &aes_key, iv, Htable, X);
CHECK_ABI_SEH(aes_gcm_dec_update_vaes_avx10_512, buf, buf, blocks * 16,
&aes_key, iv, Htable, X);
CHECK_ABI_SEH(aes_gcm_dec_update_vaes_avx10_512, buf, buf,
diff --git a/crypto/fipsmodule/modes/internal.h b/crypto/fipsmodule/modes/internal.h
index f041bf8..7a6e9aa 100644
--- a/crypto/fipsmodule/modes/internal.h
+++ b/crypto/fipsmodule/modes/internal.h
@@ -70,7 +70,6 @@
gcm_separate = 0, // No combined AES-GCM, but may have AES-CTR and GHASH.
gcm_x86_aesni,
gcm_x86_vaes_avx2,
- gcm_x86_vaes_avx10_256,
gcm_x86_vaes_avx10_512,
gcm_arm64_aes,
};
@@ -212,20 +211,10 @@
const AES_KEY *key, const uint8_t ivec[16],
const u128 Htable[16], uint8_t Xi[16]);
-void gcm_init_vpclmulqdq_avx10(u128 Htable[16], const uint64_t H[2]);
+void gcm_init_vpclmulqdq_avx10_512(u128 Htable[16], const uint64_t H[2]);
void gcm_gmult_vpclmulqdq_avx10(uint8_t Xi[16], const u128 Htable[16]);
-void gcm_ghash_vpclmulqdq_avx10_256(uint8_t Xi[16], const u128 Htable[16],
- const uint8_t *in, size_t len);
void gcm_ghash_vpclmulqdq_avx10_512(uint8_t Xi[16], const u128 Htable[16],
const uint8_t *in, size_t len);
-void aes_gcm_enc_update_vaes_avx10_256(const uint8_t *in, uint8_t *out,
- size_t len, const AES_KEY *key,
- const uint8_t ivec[16],
- const u128 Htable[16], uint8_t Xi[16]);
-void aes_gcm_dec_update_vaes_avx10_256(const uint8_t *in, uint8_t *out,
- size_t len, const AES_KEY *key,
- const uint8_t ivec[16],
- const u128 Htable[16], uint8_t Xi[16]);
void aes_gcm_enc_update_vaes_avx10_512(const uint8_t *in, uint8_t *out,
size_t len, const AES_KEY *key,
const uint8_t ivec[16],
diff --git a/crypto/impl_dispatch_test.cc b/crypto/impl_dispatch_test.cc
index bfd0045..26913c7 100644
--- a/crypto/impl_dispatch_test.cc
+++ b/crypto/impl_dispatch_test.cc
@@ -95,7 +95,6 @@
constexpr size_t kFlag_aes_hw_set_encrypt_key = 3;
constexpr size_t kFlag_vpaes_encrypt = 4;
constexpr size_t kFlag_vpaes_set_encrypt_key = 5;
-constexpr size_t kFlag_aes_gcm_enc_update_vaes_avx10_256 = 6;
constexpr size_t kFlag_aes_gcm_enc_update_vaes_avx10_512 = 7;
constexpr size_t kFlag_aes_gcm_enc_update_vaes_avx2 = 8;
@@ -109,11 +108,10 @@
is_x86_64_ && aesni_ && avx_movbe_ && !vaes_},
{kFlag_vpaes_encrypt, ssse3_ && !aesni_},
{kFlag_vpaes_set_encrypt_key, ssse3_ && !aesni_},
- {kFlag_aes_gcm_enc_update_vaes_avx10_256,
- is_x86_64_ && vaes_ && avx10_ && avoid_zmm_},
{kFlag_aes_gcm_enc_update_vaes_avx10_512,
is_x86_64_ && vaes_ && avx10_ && !avoid_zmm_},
- {kFlag_aes_gcm_enc_update_vaes_avx2, is_x86_64_ && vaes_ && !avx10_},
+ {kFlag_aes_gcm_enc_update_vaes_avx2,
+ is_x86_64_ && vaes_ && !(avx10_ && !avoid_zmm_)},
},
[] {
const uint8_t kZeros[16] = {0};
diff --git a/crypto/internal.h b/crypto/internal.h
index 62273c6..5ebfaff 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -1408,7 +1408,7 @@
// 3: aes_hw_set_encrypt_key
// 4: vpaes_encrypt
// 5: vpaes_set_encrypt_key
-// 6: aes_gcm_enc_update_vaes_avx10_256
+// 6: aes_gcm_enc_update_vaes_avx10_256 [reserved]
// 7: aes_gcm_enc_update_vaes_avx10_512
// 8: aes_gcm_enc_update_vaes_avx2
extern uint8_t BORINGSSL_function_hit[9];
diff --git a/gen/bcm/aes-gcm-avx10-x86_64-apple.S b/gen/bcm/aes-gcm-avx10-x86_64-apple.S
index b75bb07..54fcde0 100644
--- a/gen/bcm/aes-gcm-avx10-x86_64-apple.S
+++ b/gen/bcm/aes-gcm-avx10-x86_64-apple.S
@@ -75,16 +75,16 @@
-.globl _gcm_init_vpclmulqdq_avx10
-.private_extern _gcm_init_vpclmulqdq_avx10
+.globl _gcm_init_vpclmulqdq_avx10_512
+.private_extern _gcm_init_vpclmulqdq_avx10_512
.p2align 5
-_gcm_init_vpclmulqdq_avx10:
+_gcm_init_vpclmulqdq_avx10_512:
_CET_ENDBR
- leaq 256-32(%rdi),%r8
+ leaq 256-64(%rdi),%r8
@@ -112,7 +112,7 @@
vpternlogd $0x78,L$gfpoly_and_internal_carrybit(%rip),%xmm0,%xmm3
- vbroadcasti32x4 L$gfpoly(%rip),%ymm5
+ vbroadcasti32x4 L$gfpoly(%rip),%zmm5
@@ -137,16 +137,6 @@
vinserti128 $1,%xmm3,%ymm4,%ymm3
vinserti128 $1,%xmm4,%ymm4,%ymm4
-
- vmovdqu8 %ymm3,(%r8)
-
-
-
-
-
- movl $7,%eax
-L$precompute_next__func1:
- subq $32,%r8
vpclmulqdq $0x00,%ymm4,%ymm3,%ymm0
vpclmulqdq $0x01,%ymm4,%ymm3,%ymm1
vpclmulqdq $0x10,%ymm4,%ymm3,%ymm2
@@ -154,12 +144,36 @@
vpclmulqdq $0x01,%ymm0,%ymm5,%ymm2
vpshufd $0x4e,%ymm0,%ymm0
vpternlogd $0x96,%ymm2,%ymm0,%ymm1
- vpclmulqdq $0x11,%ymm4,%ymm3,%ymm3
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm4
vpclmulqdq $0x01,%ymm1,%ymm5,%ymm0
vpshufd $0x4e,%ymm1,%ymm1
- vpternlogd $0x96,%ymm0,%ymm1,%ymm3
+ vpternlogd $0x96,%ymm0,%ymm1,%ymm4
- vmovdqu8 %ymm3,(%r8)
+ vinserti64x4 $1,%ymm3,%zmm4,%zmm3
+ vshufi64x2 $0,%zmm4,%zmm4,%zmm4
+
+ vmovdqu8 %zmm3,(%r8)
+
+
+
+
+
+ movl $3,%eax
+L$precompute_next__func1:
+ subq $64,%r8
+ vpclmulqdq $0x00,%zmm4,%zmm3,%zmm0
+ vpclmulqdq $0x01,%zmm4,%zmm3,%zmm1
+ vpclmulqdq $0x10,%zmm4,%zmm3,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm0,%zmm5,%zmm2
+ vpshufd $0x4e,%zmm0,%zmm0
+ vpternlogd $0x96,%zmm2,%zmm0,%zmm1
+ vpclmulqdq $0x11,%zmm4,%zmm3,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm5,%zmm0
+ vpshufd $0x4e,%zmm1,%zmm1
+ vpternlogd $0x96,%zmm0,%zmm1,%zmm3
+
+ vmovdqu8 %zmm3,(%r8)
decl %eax
jnz L$precompute_next__func1
@@ -168,1042 +182,6 @@
-.globl _gcm_ghash_vpclmulqdq_avx10_256
-.private_extern _gcm_ghash_vpclmulqdq_avx10_256
-
-.p2align 5
-_gcm_ghash_vpclmulqdq_avx10_256:
-
-
-_CET_ENDBR
-
-
-
-
-
-
- vmovdqu L$bswap_mask(%rip),%xmm4
- vmovdqu L$gfpoly(%rip),%xmm10
-
-
- vmovdqu (%rdi),%xmm5
- vpshufb %xmm4,%xmm5,%xmm5
-
-
- cmpq $32,%rcx
- jb L$aad_blockbyblock__func1
-
-
-
- vshufi64x2 $0,%ymm4,%ymm4,%ymm4
- vshufi64x2 $0,%ymm10,%ymm10,%ymm10
-
-
- vmovdqu8 256-32(%rsi),%ymm9
-
- cmpq $128-1,%rcx
- jbe L$aad_loop_1x__func1
-
-
- vmovdqu8 256-128(%rsi),%ymm6
- vmovdqu8 256-96(%rsi),%ymm7
- vmovdqu8 256-64(%rsi),%ymm8
-
-
-L$aad_loop_4x__func1:
- vmovdqu8 0(%rdx),%ymm0
- vmovdqu8 32(%rdx),%ymm1
- vmovdqu8 64(%rdx),%ymm2
- vmovdqu8 96(%rdx),%ymm3
- vpshufb %ymm4,%ymm0,%ymm0
- vpxord %ymm5,%ymm0,%ymm0
- vpshufb %ymm4,%ymm1,%ymm1
- vpshufb %ymm4,%ymm2,%ymm2
- vpshufb %ymm4,%ymm3,%ymm3
- vpclmulqdq $0x00,%ymm6,%ymm0,%ymm5
- vpclmulqdq $0x00,%ymm7,%ymm1,%ymm11
- vpclmulqdq $0x00,%ymm8,%ymm2,%ymm12
- vpxord %ymm11,%ymm5,%ymm5
- vpclmulqdq $0x00,%ymm9,%ymm3,%ymm13
- vpternlogd $0x96,%ymm13,%ymm12,%ymm5
- vpclmulqdq $0x01,%ymm6,%ymm0,%ymm11
- vpclmulqdq $0x01,%ymm7,%ymm1,%ymm12
- vpclmulqdq $0x01,%ymm8,%ymm2,%ymm13
- vpternlogd $0x96,%ymm13,%ymm12,%ymm11
- vpclmulqdq $0x01,%ymm9,%ymm3,%ymm12
- vpclmulqdq $0x10,%ymm6,%ymm0,%ymm13
- vpternlogd $0x96,%ymm13,%ymm12,%ymm11
- vpclmulqdq $0x10,%ymm7,%ymm1,%ymm12
- vpclmulqdq $0x10,%ymm8,%ymm2,%ymm13
- vpternlogd $0x96,%ymm13,%ymm12,%ymm11
- vpclmulqdq $0x01,%ymm5,%ymm10,%ymm13
- vpclmulqdq $0x10,%ymm9,%ymm3,%ymm12
- vpxord %ymm12,%ymm11,%ymm11
- vpshufd $0x4e,%ymm5,%ymm5
- vpclmulqdq $0x11,%ymm6,%ymm0,%ymm0
- vpclmulqdq $0x11,%ymm7,%ymm1,%ymm1
- vpclmulqdq $0x11,%ymm8,%ymm2,%ymm2
- vpternlogd $0x96,%ymm13,%ymm5,%ymm11
- vpclmulqdq $0x11,%ymm9,%ymm3,%ymm3
- vpternlogd $0x96,%ymm2,%ymm1,%ymm0
- vpclmulqdq $0x01,%ymm11,%ymm10,%ymm12
- vpxord %ymm3,%ymm0,%ymm5
- vpshufd $0x4e,%ymm11,%ymm11
- vpternlogd $0x96,%ymm12,%ymm11,%ymm5
- vextracti32x4 $1,%ymm5,%xmm0
- vpxord %xmm0,%xmm5,%xmm5
-
- subq $-128,%rdx
- addq $-128,%rcx
- cmpq $128-1,%rcx
- ja L$aad_loop_4x__func1
-
-
- cmpq $32,%rcx
- jb L$aad_large_done__func1
-L$aad_loop_1x__func1:
- vmovdqu8 (%rdx),%ymm0
- vpshufb %ymm4,%ymm0,%ymm0
- vpxord %ymm0,%ymm5,%ymm5
- vpclmulqdq $0x00,%ymm9,%ymm5,%ymm0
- vpclmulqdq $0x01,%ymm9,%ymm5,%ymm1
- vpclmulqdq $0x10,%ymm9,%ymm5,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vpclmulqdq $0x01,%ymm0,%ymm10,%ymm2
- vpshufd $0x4e,%ymm0,%ymm0
- vpternlogd $0x96,%ymm2,%ymm0,%ymm1
- vpclmulqdq $0x11,%ymm9,%ymm5,%ymm5
- vpclmulqdq $0x01,%ymm1,%ymm10,%ymm0
- vpshufd $0x4e,%ymm1,%ymm1
- vpternlogd $0x96,%ymm0,%ymm1,%ymm5
-
- vextracti32x4 $1,%ymm5,%xmm0
- vpxord %xmm0,%xmm5,%xmm5
-
- addq $32,%rdx
- subq $32,%rcx
- cmpq $32,%rcx
- jae L$aad_loop_1x__func1
-
-L$aad_large_done__func1:
-
-
- vzeroupper
-
-
-L$aad_blockbyblock__func1:
- testq %rcx,%rcx
- jz L$aad_done__func1
- vmovdqu 256-16(%rsi),%xmm9
-L$aad_loop_blockbyblock__func1:
- vmovdqu (%rdx),%xmm0
- vpshufb %xmm4,%xmm0,%xmm0
- vpxor %xmm0,%xmm5,%xmm5
- vpclmulqdq $0x00,%xmm9,%xmm5,%xmm0
- vpclmulqdq $0x01,%xmm9,%xmm5,%xmm1
- vpclmulqdq $0x10,%xmm9,%xmm5,%xmm2
- vpxord %xmm2,%xmm1,%xmm1
- vpclmulqdq $0x01,%xmm0,%xmm10,%xmm2
- vpshufd $0x4e,%xmm0,%xmm0
- vpternlogd $0x96,%xmm2,%xmm0,%xmm1
- vpclmulqdq $0x11,%xmm9,%xmm5,%xmm5
- vpclmulqdq $0x01,%xmm1,%xmm10,%xmm0
- vpshufd $0x4e,%xmm1,%xmm1
- vpternlogd $0x96,%xmm0,%xmm1,%xmm5
-
- addq $16,%rdx
- subq $16,%rcx
- jnz L$aad_loop_blockbyblock__func1
-
-L$aad_done__func1:
-
- vpshufb %xmm4,%xmm5,%xmm5
- vmovdqu %xmm5,(%rdi)
- ret
-
-
-
-.globl _aes_gcm_enc_update_vaes_avx10_256
-.private_extern _aes_gcm_enc_update_vaes_avx10_256
-
-.p2align 5
-_aes_gcm_enc_update_vaes_avx10_256:
-
-
-_CET_ENDBR
- pushq %r12
-
-
- movq 16(%rsp),%r12
-#ifdef BORINGSSL_DISPATCH_TEST
-
- movb $1,_BORINGSSL_function_hit+6(%rip)
-#endif
-
- vbroadcasti32x4 L$bswap_mask(%rip),%ymm8
- vbroadcasti32x4 L$gfpoly(%rip),%ymm31
-
-
-
- vmovdqu (%r12),%xmm10
- vpshufb %xmm8,%xmm10,%xmm10
- vbroadcasti32x4 (%r8),%ymm12
- vpshufb %ymm8,%ymm12,%ymm12
-
-
-
- movl 240(%rcx),%r10d
- leal -20(,%r10,4),%r10d
-
-
-
-
- leaq 96(%rcx,%r10,4),%r11
- vbroadcasti32x4 (%rcx),%ymm13
- vbroadcasti32x4 (%r11),%ymm14
-
-
- vpaddd L$ctr_pattern(%rip),%ymm12,%ymm12
-
-
- vbroadcasti32x4 L$inc_2blocks(%rip),%ymm11
-
-
-
- cmpq $128-1,%rdx
- jbe L$crypt_loop_4x_done__func1
-
-
- vmovdqu8 256-128(%r9),%ymm27
- vmovdqu8 256-96(%r9),%ymm28
- vmovdqu8 256-64(%r9),%ymm29
- vmovdqu8 256-32(%r9),%ymm30
-
-
-
-
- vpshufb %ymm8,%ymm12,%ymm0
- vpaddd %ymm11,%ymm12,%ymm12
- vpshufb %ymm8,%ymm12,%ymm1
- vpaddd %ymm11,%ymm12,%ymm12
- vpshufb %ymm8,%ymm12,%ymm2
- vpaddd %ymm11,%ymm12,%ymm12
- vpshufb %ymm8,%ymm12,%ymm3
- vpaddd %ymm11,%ymm12,%ymm12
-
-
- vpxord %ymm13,%ymm0,%ymm0
- vpxord %ymm13,%ymm1,%ymm1
- vpxord %ymm13,%ymm2,%ymm2
- vpxord %ymm13,%ymm3,%ymm3
-
- leaq 16(%rcx),%rax
-L$vaesenc_loop_first_4_vecs__func1:
- vbroadcasti32x4 (%rax),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- vaesenc %ymm9,%ymm1,%ymm1
- vaesenc %ymm9,%ymm2,%ymm2
- vaesenc %ymm9,%ymm3,%ymm3
-
- addq $16,%rax
- cmpq %rax,%r11
- jne L$vaesenc_loop_first_4_vecs__func1
-
-
-
- vpxord 0(%rdi),%ymm14,%ymm4
- vpxord 32(%rdi),%ymm14,%ymm5
- vpxord 64(%rdi),%ymm14,%ymm6
- vpxord 96(%rdi),%ymm14,%ymm7
-
-
-
- vaesenclast %ymm4,%ymm0,%ymm4
- vaesenclast %ymm5,%ymm1,%ymm5
- vaesenclast %ymm6,%ymm2,%ymm6
- vaesenclast %ymm7,%ymm3,%ymm7
-
-
- vmovdqu8 %ymm4,0(%rsi)
- vmovdqu8 %ymm5,32(%rsi)
- vmovdqu8 %ymm6,64(%rsi)
- vmovdqu8 %ymm7,96(%rsi)
-
- subq $-128,%rdi
- subq $-128,%rsi
- addq $-128,%rdx
- cmpq $128-1,%rdx
- jbe L$ghash_last_ciphertext_4x__func1
- vbroadcasti32x4 -144(%r11),%ymm15
- vbroadcasti32x4 -128(%r11),%ymm16
- vbroadcasti32x4 -112(%r11),%ymm17
- vbroadcasti32x4 -96(%r11),%ymm18
- vbroadcasti32x4 -80(%r11),%ymm19
- vbroadcasti32x4 -64(%r11),%ymm20
- vbroadcasti32x4 -48(%r11),%ymm21
- vbroadcasti32x4 -32(%r11),%ymm22
- vbroadcasti32x4 -16(%r11),%ymm23
-L$crypt_loop_4x__func1:
-
-
-
- vpshufb %ymm8,%ymm12,%ymm0
- vpaddd %ymm11,%ymm12,%ymm12
- vpshufb %ymm8,%ymm12,%ymm1
- vpaddd %ymm11,%ymm12,%ymm12
- vpshufb %ymm8,%ymm12,%ymm2
- vpaddd %ymm11,%ymm12,%ymm12
- vpshufb %ymm8,%ymm12,%ymm3
- vpaddd %ymm11,%ymm12,%ymm12
-
-
- vpxord %ymm13,%ymm0,%ymm0
- vpxord %ymm13,%ymm1,%ymm1
- vpxord %ymm13,%ymm2,%ymm2
- vpxord %ymm13,%ymm3,%ymm3
-
- cmpl $24,%r10d
- jl L$aes128__func1
- je L$aes192__func1
-
- vbroadcasti32x4 -208(%r11),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- vaesenc %ymm9,%ymm1,%ymm1
- vaesenc %ymm9,%ymm2,%ymm2
- vaesenc %ymm9,%ymm3,%ymm3
-
- vbroadcasti32x4 -192(%r11),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- vaesenc %ymm9,%ymm1,%ymm1
- vaesenc %ymm9,%ymm2,%ymm2
- vaesenc %ymm9,%ymm3,%ymm3
-
-L$aes192__func1:
- vbroadcasti32x4 -176(%r11),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- vaesenc %ymm9,%ymm1,%ymm1
- vaesenc %ymm9,%ymm2,%ymm2
- vaesenc %ymm9,%ymm3,%ymm3
-
- vbroadcasti32x4 -160(%r11),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- vaesenc %ymm9,%ymm1,%ymm1
- vaesenc %ymm9,%ymm2,%ymm2
- vaesenc %ymm9,%ymm3,%ymm3
-
-L$aes128__func1:
- vpshufb %ymm8,%ymm4,%ymm4
- vpxord %ymm10,%ymm4,%ymm4
- vpshufb %ymm8,%ymm5,%ymm5
- vpshufb %ymm8,%ymm6,%ymm6
-
- vaesenc %ymm15,%ymm0,%ymm0
- vaesenc %ymm15,%ymm1,%ymm1
- vaesenc %ymm15,%ymm2,%ymm2
- vaesenc %ymm15,%ymm3,%ymm3
-
- vpshufb %ymm8,%ymm7,%ymm7
- vpclmulqdq $0x00,%ymm27,%ymm4,%ymm10
- vpclmulqdq $0x00,%ymm28,%ymm5,%ymm24
- vpclmulqdq $0x00,%ymm29,%ymm6,%ymm25
-
- vaesenc %ymm16,%ymm0,%ymm0
- vaesenc %ymm16,%ymm1,%ymm1
- vaesenc %ymm16,%ymm2,%ymm2
- vaesenc %ymm16,%ymm3,%ymm3
-
- vpxord %ymm24,%ymm10,%ymm10
- vpclmulqdq $0x00,%ymm30,%ymm7,%ymm26
- vpternlogd $0x96,%ymm26,%ymm25,%ymm10
- vpclmulqdq $0x01,%ymm27,%ymm4,%ymm24
-
- vaesenc %ymm17,%ymm0,%ymm0
- vaesenc %ymm17,%ymm1,%ymm1
- vaesenc %ymm17,%ymm2,%ymm2
- vaesenc %ymm17,%ymm3,%ymm3
-
- vpclmulqdq $0x01,%ymm28,%ymm5,%ymm25
- vpclmulqdq $0x01,%ymm29,%ymm6,%ymm26
- vpternlogd $0x96,%ymm26,%ymm25,%ymm24
- vpclmulqdq $0x01,%ymm30,%ymm7,%ymm25
-
- vaesenc %ymm18,%ymm0,%ymm0
- vaesenc %ymm18,%ymm1,%ymm1
- vaesenc %ymm18,%ymm2,%ymm2
- vaesenc %ymm18,%ymm3,%ymm3
-
- vpclmulqdq $0x10,%ymm27,%ymm4,%ymm26
- vpternlogd $0x96,%ymm26,%ymm25,%ymm24
- vpclmulqdq $0x10,%ymm28,%ymm5,%ymm25
- vpclmulqdq $0x10,%ymm29,%ymm6,%ymm26
-
- vaesenc %ymm19,%ymm0,%ymm0
- vaesenc %ymm19,%ymm1,%ymm1
- vaesenc %ymm19,%ymm2,%ymm2
- vaesenc %ymm19,%ymm3,%ymm3
-
- vpternlogd $0x96,%ymm26,%ymm25,%ymm24
- vpclmulqdq $0x01,%ymm10,%ymm31,%ymm26
- vpclmulqdq $0x10,%ymm30,%ymm7,%ymm25
- vpxord %ymm25,%ymm24,%ymm24
-
- vaesenc %ymm20,%ymm0,%ymm0
- vaesenc %ymm20,%ymm1,%ymm1
- vaesenc %ymm20,%ymm2,%ymm2
- vaesenc %ymm20,%ymm3,%ymm3
-
- vpshufd $0x4e,%ymm10,%ymm10
- vpclmulqdq $0x11,%ymm27,%ymm4,%ymm4
- vpclmulqdq $0x11,%ymm28,%ymm5,%ymm5
- vpclmulqdq $0x11,%ymm29,%ymm6,%ymm6
-
- vaesenc %ymm21,%ymm0,%ymm0
- vaesenc %ymm21,%ymm1,%ymm1
- vaesenc %ymm21,%ymm2,%ymm2
- vaesenc %ymm21,%ymm3,%ymm3
-
- vpternlogd $0x96,%ymm26,%ymm10,%ymm24
- vpclmulqdq $0x11,%ymm30,%ymm7,%ymm7
- vpternlogd $0x96,%ymm6,%ymm5,%ymm4
- vpclmulqdq $0x01,%ymm24,%ymm31,%ymm25
-
- vaesenc %ymm22,%ymm0,%ymm0
- vaesenc %ymm22,%ymm1,%ymm1
- vaesenc %ymm22,%ymm2,%ymm2
- vaesenc %ymm22,%ymm3,%ymm3
-
- vpxord %ymm7,%ymm4,%ymm10
- vpshufd $0x4e,%ymm24,%ymm24
- vpternlogd $0x96,%ymm25,%ymm24,%ymm10
-
- vaesenc %ymm23,%ymm0,%ymm0
- vaesenc %ymm23,%ymm1,%ymm1
- vaesenc %ymm23,%ymm2,%ymm2
- vaesenc %ymm23,%ymm3,%ymm3
-
- vextracti32x4 $1,%ymm10,%xmm4
- vpxord %xmm4,%xmm10,%xmm10
-
-
-
-
- vpxord 0(%rdi),%ymm14,%ymm4
- vpxord 32(%rdi),%ymm14,%ymm5
- vpxord 64(%rdi),%ymm14,%ymm6
- vpxord 96(%rdi),%ymm14,%ymm7
-
-
-
- vaesenclast %ymm4,%ymm0,%ymm4
- vaesenclast %ymm5,%ymm1,%ymm5
- vaesenclast %ymm6,%ymm2,%ymm6
- vaesenclast %ymm7,%ymm3,%ymm7
-
-
- vmovdqu8 %ymm4,0(%rsi)
- vmovdqu8 %ymm5,32(%rsi)
- vmovdqu8 %ymm6,64(%rsi)
- vmovdqu8 %ymm7,96(%rsi)
-
- subq $-128,%rdi
- subq $-128,%rsi
- addq $-128,%rdx
- cmpq $128-1,%rdx
- ja L$crypt_loop_4x__func1
-L$ghash_last_ciphertext_4x__func1:
- vpshufb %ymm8,%ymm4,%ymm4
- vpxord %ymm10,%ymm4,%ymm4
- vpshufb %ymm8,%ymm5,%ymm5
- vpshufb %ymm8,%ymm6,%ymm6
- vpshufb %ymm8,%ymm7,%ymm7
- vpclmulqdq $0x00,%ymm27,%ymm4,%ymm10
- vpclmulqdq $0x00,%ymm28,%ymm5,%ymm24
- vpclmulqdq $0x00,%ymm29,%ymm6,%ymm25
- vpxord %ymm24,%ymm10,%ymm10
- vpclmulqdq $0x00,%ymm30,%ymm7,%ymm26
- vpternlogd $0x96,%ymm26,%ymm25,%ymm10
- vpclmulqdq $0x01,%ymm27,%ymm4,%ymm24
- vpclmulqdq $0x01,%ymm28,%ymm5,%ymm25
- vpclmulqdq $0x01,%ymm29,%ymm6,%ymm26
- vpternlogd $0x96,%ymm26,%ymm25,%ymm24
- vpclmulqdq $0x01,%ymm30,%ymm7,%ymm25
- vpclmulqdq $0x10,%ymm27,%ymm4,%ymm26
- vpternlogd $0x96,%ymm26,%ymm25,%ymm24
- vpclmulqdq $0x10,%ymm28,%ymm5,%ymm25
- vpclmulqdq $0x10,%ymm29,%ymm6,%ymm26
- vpternlogd $0x96,%ymm26,%ymm25,%ymm24
- vpclmulqdq $0x01,%ymm10,%ymm31,%ymm26
- vpclmulqdq $0x10,%ymm30,%ymm7,%ymm25
- vpxord %ymm25,%ymm24,%ymm24
- vpshufd $0x4e,%ymm10,%ymm10
- vpclmulqdq $0x11,%ymm27,%ymm4,%ymm4
- vpclmulqdq $0x11,%ymm28,%ymm5,%ymm5
- vpclmulqdq $0x11,%ymm29,%ymm6,%ymm6
- vpternlogd $0x96,%ymm26,%ymm10,%ymm24
- vpclmulqdq $0x11,%ymm30,%ymm7,%ymm7
- vpternlogd $0x96,%ymm6,%ymm5,%ymm4
- vpclmulqdq $0x01,%ymm24,%ymm31,%ymm25
- vpxord %ymm7,%ymm4,%ymm10
- vpshufd $0x4e,%ymm24,%ymm24
- vpternlogd $0x96,%ymm25,%ymm24,%ymm10
- vextracti32x4 $1,%ymm10,%xmm4
- vpxord %xmm4,%xmm10,%xmm10
-
-L$crypt_loop_4x_done__func1:
-
- testq %rdx,%rdx
- jz L$done__func1
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- movq %rdx,%rax
- negq %rax
- andq $-16,%rax
- leaq 256(%r9,%rax,1),%r8
- vpxor %xmm4,%xmm4,%xmm4
- vpxor %xmm5,%xmm5,%xmm5
- vpxor %xmm6,%xmm6,%xmm6
-
- cmpq $32,%rdx
- jb L$partial_vec__func1
-
-L$crypt_loop_1x__func1:
-
-
-
- vpshufb %ymm8,%ymm12,%ymm0
- vpaddd %ymm11,%ymm12,%ymm12
- vpxord %ymm13,%ymm0,%ymm0
- leaq 16(%rcx),%rax
-L$vaesenc_loop_tail_full_vec__func1:
- vbroadcasti32x4 (%rax),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- addq $16,%rax
- cmpq %rax,%r11
- jne L$vaesenc_loop_tail_full_vec__func1
- vaesenclast %ymm14,%ymm0,%ymm0
-
-
- vmovdqu8 (%rdi),%ymm1
- vpxord %ymm1,%ymm0,%ymm0
- vmovdqu8 %ymm0,(%rsi)
-
-
- vmovdqu8 (%r8),%ymm30
- vpshufb %ymm8,%ymm0,%ymm0
- vpxord %ymm10,%ymm0,%ymm0
- vpclmulqdq $0x00,%ymm30,%ymm0,%ymm7
- vpclmulqdq $0x01,%ymm30,%ymm0,%ymm1
- vpclmulqdq $0x10,%ymm30,%ymm0,%ymm2
- vpclmulqdq $0x11,%ymm30,%ymm0,%ymm3
- vpxord %ymm7,%ymm4,%ymm4
- vpternlogd $0x96,%ymm2,%ymm1,%ymm5
- vpxord %ymm3,%ymm6,%ymm6
-
- vpxor %xmm10,%xmm10,%xmm10
-
- addq $32,%r8
- addq $32,%rdi
- addq $32,%rsi
- subq $32,%rdx
- cmpq $32,%rdx
- jae L$crypt_loop_1x__func1
-
- testq %rdx,%rdx
- jz L$reduce__func1
-
-L$partial_vec__func1:
-
-
-
-
- movq $-1,%rax
- bzhiq %rdx,%rax,%rax
- kmovd %eax,%k1
- addq $15,%rdx
- andq $-16,%rdx
- movq $-1,%rax
- bzhiq %rdx,%rax,%rax
- kmovd %eax,%k2
-
-
-
- vpshufb %ymm8,%ymm12,%ymm0
- vpxord %ymm13,%ymm0,%ymm0
- leaq 16(%rcx),%rax
-L$vaesenc_loop_tail_partialvec__func1:
- vbroadcasti32x4 (%rax),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- addq $16,%rax
- cmpq %rax,%r11
- jne L$vaesenc_loop_tail_partialvec__func1
- vaesenclast %ymm14,%ymm0,%ymm0
-
-
- vmovdqu8 (%rdi),%ymm1{%k1}{z}
- vpxord %ymm1,%ymm0,%ymm0
- vmovdqu8 %ymm0,(%rsi){%k1}
-
-
-
-
-
-
-
-
-
-
-
-
-
- vmovdqu8 (%r8),%ymm30{%k2}{z}
- vmovdqu8 %ymm0,%ymm1{%k1}{z}
- vpshufb %ymm8,%ymm1,%ymm0
- vpxord %ymm10,%ymm0,%ymm0
- vpclmulqdq $0x00,%ymm30,%ymm0,%ymm7
- vpclmulqdq $0x01,%ymm30,%ymm0,%ymm1
- vpclmulqdq $0x10,%ymm30,%ymm0,%ymm2
- vpclmulqdq $0x11,%ymm30,%ymm0,%ymm3
- vpxord %ymm7,%ymm4,%ymm4
- vpternlogd $0x96,%ymm2,%ymm1,%ymm5
- vpxord %ymm3,%ymm6,%ymm6
-
-
-L$reduce__func1:
-
- vpclmulqdq $0x01,%ymm4,%ymm31,%ymm0
- vpshufd $0x4e,%ymm4,%ymm4
- vpternlogd $0x96,%ymm0,%ymm4,%ymm5
- vpclmulqdq $0x01,%ymm5,%ymm31,%ymm0
- vpshufd $0x4e,%ymm5,%ymm5
- vpternlogd $0x96,%ymm0,%ymm5,%ymm6
-
- vextracti32x4 $1,%ymm6,%xmm0
- vpxord %xmm0,%xmm6,%xmm10
-
-
-L$done__func1:
-
- vpshufb %xmm8,%xmm10,%xmm10
- vmovdqu %xmm10,(%r12)
-
- vzeroupper
- popq %r12
-
- ret
-
-
-
-.globl _aes_gcm_dec_update_vaes_avx10_256
-.private_extern _aes_gcm_dec_update_vaes_avx10_256
-
-.p2align 5
-_aes_gcm_dec_update_vaes_avx10_256:
-
-
-_CET_ENDBR
- pushq %r12
-
-
- movq 16(%rsp),%r12
-
- vbroadcasti32x4 L$bswap_mask(%rip),%ymm8
- vbroadcasti32x4 L$gfpoly(%rip),%ymm31
-
-
-
- vmovdqu (%r12),%xmm10
- vpshufb %xmm8,%xmm10,%xmm10
- vbroadcasti32x4 (%r8),%ymm12
- vpshufb %ymm8,%ymm12,%ymm12
-
-
-
- movl 240(%rcx),%r10d
- leal -20(,%r10,4),%r10d
-
-
-
-
- leaq 96(%rcx,%r10,4),%r11
- vbroadcasti32x4 (%rcx),%ymm13
- vbroadcasti32x4 (%r11),%ymm14
-
-
- vpaddd L$ctr_pattern(%rip),%ymm12,%ymm12
-
-
- vbroadcasti32x4 L$inc_2blocks(%rip),%ymm11
-
-
-
- cmpq $128-1,%rdx
- jbe L$crypt_loop_4x_done__func2
-
-
- vmovdqu8 256-128(%r9),%ymm27
- vmovdqu8 256-96(%r9),%ymm28
- vmovdqu8 256-64(%r9),%ymm29
- vmovdqu8 256-32(%r9),%ymm30
- vbroadcasti32x4 -144(%r11),%ymm15
- vbroadcasti32x4 -128(%r11),%ymm16
- vbroadcasti32x4 -112(%r11),%ymm17
- vbroadcasti32x4 -96(%r11),%ymm18
- vbroadcasti32x4 -80(%r11),%ymm19
- vbroadcasti32x4 -64(%r11),%ymm20
- vbroadcasti32x4 -48(%r11),%ymm21
- vbroadcasti32x4 -32(%r11),%ymm22
- vbroadcasti32x4 -16(%r11),%ymm23
-L$crypt_loop_4x__func2:
- vmovdqu8 0(%rdi),%ymm4
- vmovdqu8 32(%rdi),%ymm5
- vmovdqu8 64(%rdi),%ymm6
- vmovdqu8 96(%rdi),%ymm7
-
-
-
- vpshufb %ymm8,%ymm12,%ymm0
- vpaddd %ymm11,%ymm12,%ymm12
- vpshufb %ymm8,%ymm12,%ymm1
- vpaddd %ymm11,%ymm12,%ymm12
- vpshufb %ymm8,%ymm12,%ymm2
- vpaddd %ymm11,%ymm12,%ymm12
- vpshufb %ymm8,%ymm12,%ymm3
- vpaddd %ymm11,%ymm12,%ymm12
-
-
- vpxord %ymm13,%ymm0,%ymm0
- vpxord %ymm13,%ymm1,%ymm1
- vpxord %ymm13,%ymm2,%ymm2
- vpxord %ymm13,%ymm3,%ymm3
-
- cmpl $24,%r10d
- jl L$aes128__func2
- je L$aes192__func2
-
- vbroadcasti32x4 -208(%r11),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- vaesenc %ymm9,%ymm1,%ymm1
- vaesenc %ymm9,%ymm2,%ymm2
- vaesenc %ymm9,%ymm3,%ymm3
-
- vbroadcasti32x4 -192(%r11),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- vaesenc %ymm9,%ymm1,%ymm1
- vaesenc %ymm9,%ymm2,%ymm2
- vaesenc %ymm9,%ymm3,%ymm3
-
-L$aes192__func2:
- vbroadcasti32x4 -176(%r11),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- vaesenc %ymm9,%ymm1,%ymm1
- vaesenc %ymm9,%ymm2,%ymm2
- vaesenc %ymm9,%ymm3,%ymm3
-
- vbroadcasti32x4 -160(%r11),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- vaesenc %ymm9,%ymm1,%ymm1
- vaesenc %ymm9,%ymm2,%ymm2
- vaesenc %ymm9,%ymm3,%ymm3
-
-L$aes128__func2:
- vpshufb %ymm8,%ymm4,%ymm4
- vpxord %ymm10,%ymm4,%ymm4
- vpshufb %ymm8,%ymm5,%ymm5
- vpshufb %ymm8,%ymm6,%ymm6
-
- vaesenc %ymm15,%ymm0,%ymm0
- vaesenc %ymm15,%ymm1,%ymm1
- vaesenc %ymm15,%ymm2,%ymm2
- vaesenc %ymm15,%ymm3,%ymm3
-
- vpshufb %ymm8,%ymm7,%ymm7
- vpclmulqdq $0x00,%ymm27,%ymm4,%ymm10
- vpclmulqdq $0x00,%ymm28,%ymm5,%ymm24
- vpclmulqdq $0x00,%ymm29,%ymm6,%ymm25
-
- vaesenc %ymm16,%ymm0,%ymm0
- vaesenc %ymm16,%ymm1,%ymm1
- vaesenc %ymm16,%ymm2,%ymm2
- vaesenc %ymm16,%ymm3,%ymm3
-
- vpxord %ymm24,%ymm10,%ymm10
- vpclmulqdq $0x00,%ymm30,%ymm7,%ymm26
- vpternlogd $0x96,%ymm26,%ymm25,%ymm10
- vpclmulqdq $0x01,%ymm27,%ymm4,%ymm24
-
- vaesenc %ymm17,%ymm0,%ymm0
- vaesenc %ymm17,%ymm1,%ymm1
- vaesenc %ymm17,%ymm2,%ymm2
- vaesenc %ymm17,%ymm3,%ymm3
-
- vpclmulqdq $0x01,%ymm28,%ymm5,%ymm25
- vpclmulqdq $0x01,%ymm29,%ymm6,%ymm26
- vpternlogd $0x96,%ymm26,%ymm25,%ymm24
- vpclmulqdq $0x01,%ymm30,%ymm7,%ymm25
-
- vaesenc %ymm18,%ymm0,%ymm0
- vaesenc %ymm18,%ymm1,%ymm1
- vaesenc %ymm18,%ymm2,%ymm2
- vaesenc %ymm18,%ymm3,%ymm3
-
- vpclmulqdq $0x10,%ymm27,%ymm4,%ymm26
- vpternlogd $0x96,%ymm26,%ymm25,%ymm24
- vpclmulqdq $0x10,%ymm28,%ymm5,%ymm25
- vpclmulqdq $0x10,%ymm29,%ymm6,%ymm26
-
- vaesenc %ymm19,%ymm0,%ymm0
- vaesenc %ymm19,%ymm1,%ymm1
- vaesenc %ymm19,%ymm2,%ymm2
- vaesenc %ymm19,%ymm3,%ymm3
-
- vpternlogd $0x96,%ymm26,%ymm25,%ymm24
- vpclmulqdq $0x01,%ymm10,%ymm31,%ymm26
- vpclmulqdq $0x10,%ymm30,%ymm7,%ymm25
- vpxord %ymm25,%ymm24,%ymm24
-
- vaesenc %ymm20,%ymm0,%ymm0
- vaesenc %ymm20,%ymm1,%ymm1
- vaesenc %ymm20,%ymm2,%ymm2
- vaesenc %ymm20,%ymm3,%ymm3
-
- vpshufd $0x4e,%ymm10,%ymm10
- vpclmulqdq $0x11,%ymm27,%ymm4,%ymm4
- vpclmulqdq $0x11,%ymm28,%ymm5,%ymm5
- vpclmulqdq $0x11,%ymm29,%ymm6,%ymm6
-
- vaesenc %ymm21,%ymm0,%ymm0
- vaesenc %ymm21,%ymm1,%ymm1
- vaesenc %ymm21,%ymm2,%ymm2
- vaesenc %ymm21,%ymm3,%ymm3
-
- vpternlogd $0x96,%ymm26,%ymm10,%ymm24
- vpclmulqdq $0x11,%ymm30,%ymm7,%ymm7
- vpternlogd $0x96,%ymm6,%ymm5,%ymm4
- vpclmulqdq $0x01,%ymm24,%ymm31,%ymm25
-
- vaesenc %ymm22,%ymm0,%ymm0
- vaesenc %ymm22,%ymm1,%ymm1
- vaesenc %ymm22,%ymm2,%ymm2
- vaesenc %ymm22,%ymm3,%ymm3
-
- vpxord %ymm7,%ymm4,%ymm10
- vpshufd $0x4e,%ymm24,%ymm24
- vpternlogd $0x96,%ymm25,%ymm24,%ymm10
-
- vaesenc %ymm23,%ymm0,%ymm0
- vaesenc %ymm23,%ymm1,%ymm1
- vaesenc %ymm23,%ymm2,%ymm2
- vaesenc %ymm23,%ymm3,%ymm3
-
- vextracti32x4 $1,%ymm10,%xmm4
- vpxord %xmm4,%xmm10,%xmm10
-
-
-
-
- vpxord 0(%rdi),%ymm14,%ymm4
- vpxord 32(%rdi),%ymm14,%ymm5
- vpxord 64(%rdi),%ymm14,%ymm6
- vpxord 96(%rdi),%ymm14,%ymm7
-
-
-
- vaesenclast %ymm4,%ymm0,%ymm4
- vaesenclast %ymm5,%ymm1,%ymm5
- vaesenclast %ymm6,%ymm2,%ymm6
- vaesenclast %ymm7,%ymm3,%ymm7
-
-
- vmovdqu8 %ymm4,0(%rsi)
- vmovdqu8 %ymm5,32(%rsi)
- vmovdqu8 %ymm6,64(%rsi)
- vmovdqu8 %ymm7,96(%rsi)
-
- subq $-128,%rdi
- subq $-128,%rsi
- addq $-128,%rdx
- cmpq $128-1,%rdx
- ja L$crypt_loop_4x__func2
-L$crypt_loop_4x_done__func2:
-
- testq %rdx,%rdx
- jz L$done__func2
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- movq %rdx,%rax
- negq %rax
- andq $-16,%rax
- leaq 256(%r9,%rax,1),%r8
- vpxor %xmm4,%xmm4,%xmm4
- vpxor %xmm5,%xmm5,%xmm5
- vpxor %xmm6,%xmm6,%xmm6
-
- cmpq $32,%rdx
- jb L$partial_vec__func2
-
-L$crypt_loop_1x__func2:
-
-
-
- vpshufb %ymm8,%ymm12,%ymm0
- vpaddd %ymm11,%ymm12,%ymm12
- vpxord %ymm13,%ymm0,%ymm0
- leaq 16(%rcx),%rax
-L$vaesenc_loop_tail_full_vec__func2:
- vbroadcasti32x4 (%rax),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- addq $16,%rax
- cmpq %rax,%r11
- jne L$vaesenc_loop_tail_full_vec__func2
- vaesenclast %ymm14,%ymm0,%ymm0
-
-
- vmovdqu8 (%rdi),%ymm1
- vpxord %ymm1,%ymm0,%ymm0
- vmovdqu8 %ymm0,(%rsi)
-
-
- vmovdqu8 (%r8),%ymm30
- vpshufb %ymm8,%ymm1,%ymm0
- vpxord %ymm10,%ymm0,%ymm0
- vpclmulqdq $0x00,%ymm30,%ymm0,%ymm7
- vpclmulqdq $0x01,%ymm30,%ymm0,%ymm1
- vpclmulqdq $0x10,%ymm30,%ymm0,%ymm2
- vpclmulqdq $0x11,%ymm30,%ymm0,%ymm3
- vpxord %ymm7,%ymm4,%ymm4
- vpternlogd $0x96,%ymm2,%ymm1,%ymm5
- vpxord %ymm3,%ymm6,%ymm6
-
- vpxor %xmm10,%xmm10,%xmm10
-
- addq $32,%r8
- addq $32,%rdi
- addq $32,%rsi
- subq $32,%rdx
- cmpq $32,%rdx
- jae L$crypt_loop_1x__func2
-
- testq %rdx,%rdx
- jz L$reduce__func2
-
-L$partial_vec__func2:
-
-
-
-
- movq $-1,%rax
- bzhiq %rdx,%rax,%rax
- kmovd %eax,%k1
- addq $15,%rdx
- andq $-16,%rdx
- movq $-1,%rax
- bzhiq %rdx,%rax,%rax
- kmovd %eax,%k2
-
-
-
- vpshufb %ymm8,%ymm12,%ymm0
- vpxord %ymm13,%ymm0,%ymm0
- leaq 16(%rcx),%rax
-L$vaesenc_loop_tail_partialvec__func2:
- vbroadcasti32x4 (%rax),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- addq $16,%rax
- cmpq %rax,%r11
- jne L$vaesenc_loop_tail_partialvec__func2
- vaesenclast %ymm14,%ymm0,%ymm0
-
-
- vmovdqu8 (%rdi),%ymm1{%k1}{z}
- vpxord %ymm1,%ymm0,%ymm0
- vmovdqu8 %ymm0,(%rsi){%k1}
-
-
-
-
-
-
-
-
-
-
-
-
-
- vmovdqu8 (%r8),%ymm30{%k2}{z}
-
- vpshufb %ymm8,%ymm1,%ymm0
- vpxord %ymm10,%ymm0,%ymm0
- vpclmulqdq $0x00,%ymm30,%ymm0,%ymm7
- vpclmulqdq $0x01,%ymm30,%ymm0,%ymm1
- vpclmulqdq $0x10,%ymm30,%ymm0,%ymm2
- vpclmulqdq $0x11,%ymm30,%ymm0,%ymm3
- vpxord %ymm7,%ymm4,%ymm4
- vpternlogd $0x96,%ymm2,%ymm1,%ymm5
- vpxord %ymm3,%ymm6,%ymm6
-
-
-L$reduce__func2:
-
- vpclmulqdq $0x01,%ymm4,%ymm31,%ymm0
- vpshufd $0x4e,%ymm4,%ymm4
- vpternlogd $0x96,%ymm0,%ymm4,%ymm5
- vpclmulqdq $0x01,%ymm5,%ymm31,%ymm0
- vpshufd $0x4e,%ymm5,%ymm5
- vpternlogd $0x96,%ymm0,%ymm5,%ymm6
-
- vextracti32x4 $1,%ymm6,%xmm0
- vpxord %xmm0,%xmm6,%xmm10
-
-
-L$done__func2:
-
- vpshufb %xmm8,%xmm10,%xmm10
- vmovdqu %xmm10,(%r12)
-
- vzeroupper
- popq %r12
-
- ret
-
-
-
.globl _gcm_ghash_vpclmulqdq_avx10_512
.private_extern _gcm_ghash_vpclmulqdq_avx10_512
@@ -1227,7 +205,7 @@
cmpq $64,%rcx
- jb L$aad_blockbyblock__func2
+ jb L$aad_blockbyblock__func1
@@ -1238,7 +216,7 @@
vmovdqu8 256-64(%rsi),%zmm9
cmpq $256-1,%rcx
- jbe L$aad_loop_1x__func2
+ jbe L$aad_loop_1x__func1
vmovdqu8 256-256(%rsi),%zmm6
@@ -1246,7 +224,7 @@
vmovdqu8 256-128(%rsi),%zmm8
-L$aad_loop_4x__func2:
+L$aad_loop_4x__func1:
vmovdqu8 0(%rdx),%zmm0
vmovdqu8 64(%rdx),%zmm1
vmovdqu8 128(%rdx),%zmm2
@@ -1295,12 +273,12 @@
subq $-256,%rdx
addq $-256,%rcx
cmpq $256-1,%rcx
- ja L$aad_loop_4x__func2
+ ja L$aad_loop_4x__func1
cmpq $64,%rcx
- jb L$aad_large_done__func2
-L$aad_loop_1x__func2:
+ jb L$aad_large_done__func1
+L$aad_loop_1x__func1:
vmovdqu8 (%rdx),%zmm0
vpshufb %zmm4,%zmm0,%zmm0
vpxord %zmm0,%zmm5,%zmm5
@@ -1325,19 +303,19 @@
addq $64,%rdx
subq $64,%rcx
cmpq $64,%rcx
- jae L$aad_loop_1x__func2
+ jae L$aad_loop_1x__func1
-L$aad_large_done__func2:
+L$aad_large_done__func1:
vzeroupper
-L$aad_blockbyblock__func2:
+L$aad_blockbyblock__func1:
testq %rcx,%rcx
- jz L$aad_done__func2
+ jz L$aad_done__func1
vmovdqu 256-16(%rsi),%xmm9
-L$aad_loop_blockbyblock__func2:
+L$aad_loop_blockbyblock__func1:
vmovdqu (%rdx),%xmm0
vpshufb %xmm4,%xmm0,%xmm0
vpxor %xmm0,%xmm5,%xmm5
@@ -1355,9 +333,9 @@
addq $16,%rdx
subq $16,%rcx
- jnz L$aad_loop_blockbyblock__func2
+ jnz L$aad_loop_blockbyblock__func1
-L$aad_done__func2:
+L$aad_done__func1:
vpshufb %xmm4,%xmm5,%xmm5
vmovdqu %xmm5,(%rdi)
@@ -1413,7 +391,7 @@
cmpq $256-1,%rdx
- jbe L$crypt_loop_4x_done__func3
+ jbe L$crypt_loop_4x_done__func1
vmovdqu8 256-256(%r9),%zmm27
@@ -1440,7 +418,7 @@
vpxord %zmm13,%zmm3,%zmm3
leaq 16(%rcx),%rax
-L$vaesenc_loop_first_4_vecs__func3:
+L$vaesenc_loop_first_4_vecs__func1:
vbroadcasti32x4 (%rax),%zmm9
vaesenc %zmm9,%zmm0,%zmm0
vaesenc %zmm9,%zmm1,%zmm1
@@ -1449,7 +427,7 @@
addq $16,%rax
cmpq %rax,%r11
- jne L$vaesenc_loop_first_4_vecs__func3
+ jne L$vaesenc_loop_first_4_vecs__func1
@@ -1475,7 +453,7 @@
subq $-256,%rsi
addq $-256,%rdx
cmpq $256-1,%rdx
- jbe L$ghash_last_ciphertext_4x__func3
+ jbe L$ghash_last_ciphertext_4x__func1
vbroadcasti32x4 -144(%r11),%zmm15
vbroadcasti32x4 -128(%r11),%zmm16
vbroadcasti32x4 -112(%r11),%zmm17
@@ -1485,7 +463,7 @@
vbroadcasti32x4 -48(%r11),%zmm21
vbroadcasti32x4 -32(%r11),%zmm22
vbroadcasti32x4 -16(%r11),%zmm23
-L$crypt_loop_4x__func3:
+L$crypt_loop_4x__func1:
@@ -1505,8 +483,8 @@
vpxord %zmm13,%zmm3,%zmm3
cmpl $24,%r10d
- jl L$aes128__func3
- je L$aes192__func3
+ jl L$aes128__func1
+ je L$aes192__func1
vbroadcasti32x4 -208(%r11),%zmm9
vaesenc %zmm9,%zmm0,%zmm0
@@ -1520,7 +498,7 @@
vaesenc %zmm9,%zmm2,%zmm2
vaesenc %zmm9,%zmm3,%zmm3
-L$aes192__func3:
+L$aes192__func1:
vbroadcasti32x4 -176(%r11),%zmm9
vaesenc %zmm9,%zmm0,%zmm0
vaesenc %zmm9,%zmm1,%zmm1
@@ -1533,7 +511,7 @@
vaesenc %zmm9,%zmm2,%zmm2
vaesenc %zmm9,%zmm3,%zmm3
-L$aes128__func3:
+L$aes128__func1:
vpshufb %zmm8,%zmm4,%zmm4
vpxord %zmm10,%zmm4,%zmm4
vpshufb %zmm8,%zmm5,%zmm5
@@ -1654,8 +632,8 @@
subq $-256,%rsi
addq $-256,%rdx
cmpq $256-1,%rdx
- ja L$crypt_loop_4x__func3
-L$ghash_last_ciphertext_4x__func3:
+ ja L$crypt_loop_4x__func1
+L$ghash_last_ciphertext_4x__func1:
vpshufb %zmm8,%zmm4,%zmm4
vpxord %zmm10,%zmm4,%zmm4
vpshufb %zmm8,%zmm5,%zmm5
@@ -1697,10 +675,10 @@
vpxord %xmm4,%xmm10,%xmm10
vpternlogd $0x96,%xmm5,%xmm6,%xmm10
-L$crypt_loop_4x_done__func3:
+L$crypt_loop_4x_done__func1:
testq %rdx,%rdx
- jz L$done__func3
+ jz L$done__func1
@@ -1730,9 +708,9 @@
vpxor %xmm6,%xmm6,%xmm6
cmpq $64,%rdx
- jb L$partial_vec__func3
+ jb L$partial_vec__func1
-L$crypt_loop_1x__func3:
+L$crypt_loop_1x__func1:
@@ -1740,12 +718,12 @@
vpaddd %zmm11,%zmm12,%zmm12
vpxord %zmm13,%zmm0,%zmm0
leaq 16(%rcx),%rax
-L$vaesenc_loop_tail_full_vec__func3:
+L$vaesenc_loop_tail_full_vec__func1:
vbroadcasti32x4 (%rax),%zmm9
vaesenc %zmm9,%zmm0,%zmm0
addq $16,%rax
cmpq %rax,%r11
- jne L$vaesenc_loop_tail_full_vec__func3
+ jne L$vaesenc_loop_tail_full_vec__func1
vaesenclast %zmm14,%zmm0,%zmm0
@@ -1772,12 +750,12 @@
addq $64,%rsi
subq $64,%rdx
cmpq $64,%rdx
- jae L$crypt_loop_1x__func3
+ jae L$crypt_loop_1x__func1
testq %rdx,%rdx
- jz L$reduce__func3
+ jz L$reduce__func1
-L$partial_vec__func3:
+L$partial_vec__func1:
@@ -1796,12 +774,12 @@
vpshufb %zmm8,%zmm12,%zmm0
vpxord %zmm13,%zmm0,%zmm0
leaq 16(%rcx),%rax
-L$vaesenc_loop_tail_partialvec__func3:
+L$vaesenc_loop_tail_partialvec__func1:
vbroadcasti32x4 (%rax),%zmm9
vaesenc %zmm9,%zmm0,%zmm0
addq $16,%rax
cmpq %rax,%r11
- jne L$vaesenc_loop_tail_partialvec__func3
+ jne L$vaesenc_loop_tail_partialvec__func1
vaesenclast %zmm14,%zmm0,%zmm0
@@ -1834,7 +812,7 @@
vpxord %zmm3,%zmm6,%zmm6
-L$reduce__func3:
+L$reduce__func1:
vpclmulqdq $0x01,%zmm4,%zmm31,%zmm0
vpshufd $0x4e,%zmm4,%zmm4
@@ -1850,7 +828,7 @@
vpternlogd $0x96,%xmm1,%xmm2,%xmm10
-L$done__func3:
+L$done__func1:
vpshufb %xmm8,%xmm10,%xmm10
vmovdqu %xmm10,(%r12)
@@ -1906,7 +884,7 @@
cmpq $256-1,%rdx
- jbe L$crypt_loop_4x_done__func4
+ jbe L$crypt_loop_4x_done__func2
vmovdqu8 256-256(%r9),%zmm27
@@ -1922,7 +900,7 @@
vbroadcasti32x4 -48(%r11),%zmm21
vbroadcasti32x4 -32(%r11),%zmm22
vbroadcasti32x4 -16(%r11),%zmm23
-L$crypt_loop_4x__func4:
+L$crypt_loop_4x__func2:
vmovdqu8 0(%rdi),%zmm4
vmovdqu8 64(%rdi),%zmm5
vmovdqu8 128(%rdi),%zmm6
@@ -1946,8 +924,8 @@
vpxord %zmm13,%zmm3,%zmm3
cmpl $24,%r10d
- jl L$aes128__func4
- je L$aes192__func4
+ jl L$aes128__func2
+ je L$aes192__func2
vbroadcasti32x4 -208(%r11),%zmm9
vaesenc %zmm9,%zmm0,%zmm0
@@ -1961,7 +939,7 @@
vaesenc %zmm9,%zmm2,%zmm2
vaesenc %zmm9,%zmm3,%zmm3
-L$aes192__func4:
+L$aes192__func2:
vbroadcasti32x4 -176(%r11),%zmm9
vaesenc %zmm9,%zmm0,%zmm0
vaesenc %zmm9,%zmm1,%zmm1
@@ -1974,7 +952,7 @@
vaesenc %zmm9,%zmm2,%zmm2
vaesenc %zmm9,%zmm3,%zmm3
-L$aes128__func4:
+L$aes128__func2:
vpshufb %zmm8,%zmm4,%zmm4
vpxord %zmm10,%zmm4,%zmm4
vpshufb %zmm8,%zmm5,%zmm5
@@ -2095,11 +1073,11 @@
subq $-256,%rsi
addq $-256,%rdx
cmpq $256-1,%rdx
- ja L$crypt_loop_4x__func4
-L$crypt_loop_4x_done__func4:
+ ja L$crypt_loop_4x__func2
+L$crypt_loop_4x_done__func2:
testq %rdx,%rdx
- jz L$done__func4
+ jz L$done__func2
@@ -2129,9 +1107,9 @@
vpxor %xmm6,%xmm6,%xmm6
cmpq $64,%rdx
- jb L$partial_vec__func4
+ jb L$partial_vec__func2
-L$crypt_loop_1x__func4:
+L$crypt_loop_1x__func2:
@@ -2139,12 +1117,12 @@
vpaddd %zmm11,%zmm12,%zmm12
vpxord %zmm13,%zmm0,%zmm0
leaq 16(%rcx),%rax
-L$vaesenc_loop_tail_full_vec__func4:
+L$vaesenc_loop_tail_full_vec__func2:
vbroadcasti32x4 (%rax),%zmm9
vaesenc %zmm9,%zmm0,%zmm0
addq $16,%rax
cmpq %rax,%r11
- jne L$vaesenc_loop_tail_full_vec__func4
+ jne L$vaesenc_loop_tail_full_vec__func2
vaesenclast %zmm14,%zmm0,%zmm0
@@ -2171,12 +1149,12 @@
addq $64,%rsi
subq $64,%rdx
cmpq $64,%rdx
- jae L$crypt_loop_1x__func4
+ jae L$crypt_loop_1x__func2
testq %rdx,%rdx
- jz L$reduce__func4
+ jz L$reduce__func2
-L$partial_vec__func4:
+L$partial_vec__func2:
@@ -2195,12 +1173,12 @@
vpshufb %zmm8,%zmm12,%zmm0
vpxord %zmm13,%zmm0,%zmm0
leaq 16(%rcx),%rax
-L$vaesenc_loop_tail_partialvec__func4:
+L$vaesenc_loop_tail_partialvec__func2:
vbroadcasti32x4 (%rax),%zmm9
vaesenc %zmm9,%zmm0,%zmm0
addq $16,%rax
cmpq %rax,%r11
- jne L$vaesenc_loop_tail_partialvec__func4
+ jne L$vaesenc_loop_tail_partialvec__func2
vaesenclast %zmm14,%zmm0,%zmm0
@@ -2233,7 +1211,7 @@
vpxord %zmm3,%zmm6,%zmm6
-L$reduce__func4:
+L$reduce__func2:
vpclmulqdq $0x01,%zmm4,%zmm31,%zmm0
vpshufd $0x4e,%zmm4,%zmm4
@@ -2249,7 +1227,7 @@
vpternlogd $0x96,%xmm1,%xmm2,%xmm10
-L$done__func4:
+L$done__func2:
vpshufb %xmm8,%xmm10,%xmm10
vmovdqu %xmm10,(%r12)
diff --git a/gen/bcm/aes-gcm-avx10-x86_64-linux.S b/gen/bcm/aes-gcm-avx10-x86_64-linux.S
index cf661c8..2be6a8c 100644
--- a/gen/bcm/aes-gcm-avx10-x86_64-linux.S
+++ b/gen/bcm/aes-gcm-avx10-x86_64-linux.S
@@ -75,16 +75,16 @@
.cfi_endproc
.size gcm_gmult_vpclmulqdq_avx10, . - gcm_gmult_vpclmulqdq_avx10
-.globl gcm_init_vpclmulqdq_avx10
-.hidden gcm_init_vpclmulqdq_avx10
-.type gcm_init_vpclmulqdq_avx10,@function
+.globl gcm_init_vpclmulqdq_avx10_512
+.hidden gcm_init_vpclmulqdq_avx10_512
+.type gcm_init_vpclmulqdq_avx10_512,@function
.align 32
-gcm_init_vpclmulqdq_avx10:
+gcm_init_vpclmulqdq_avx10_512:
.cfi_startproc
_CET_ENDBR
- leaq 256-32(%rdi),%r8
+ leaq 256-64(%rdi),%r8
@@ -112,7 +112,7 @@
vpternlogd $0x78,.Lgfpoly_and_internal_carrybit(%rip),%xmm0,%xmm3
- vbroadcasti32x4 .Lgfpoly(%rip),%ymm5
+ vbroadcasti32x4 .Lgfpoly(%rip),%zmm5
@@ -137,16 +137,6 @@
vinserti128 $1,%xmm3,%ymm4,%ymm3
vinserti128 $1,%xmm4,%ymm4,%ymm4
-
- vmovdqu8 %ymm3,(%r8)
-
-
-
-
-
- movl $7,%eax
-.Lprecompute_next__func1:
- subq $32,%r8
vpclmulqdq $0x00,%ymm4,%ymm3,%ymm0
vpclmulqdq $0x01,%ymm4,%ymm3,%ymm1
vpclmulqdq $0x10,%ymm4,%ymm3,%ymm2
@@ -154,12 +144,36 @@
vpclmulqdq $0x01,%ymm0,%ymm5,%ymm2
vpshufd $0x4e,%ymm0,%ymm0
vpternlogd $0x96,%ymm2,%ymm0,%ymm1
- vpclmulqdq $0x11,%ymm4,%ymm3,%ymm3
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm4
vpclmulqdq $0x01,%ymm1,%ymm5,%ymm0
vpshufd $0x4e,%ymm1,%ymm1
- vpternlogd $0x96,%ymm0,%ymm1,%ymm3
+ vpternlogd $0x96,%ymm0,%ymm1,%ymm4
- vmovdqu8 %ymm3,(%r8)
+ vinserti64x4 $1,%ymm3,%zmm4,%zmm3
+ vshufi64x2 $0,%zmm4,%zmm4,%zmm4
+
+ vmovdqu8 %zmm3,(%r8)
+
+
+
+
+
+ movl $3,%eax
+.Lprecompute_next__func1:
+ subq $64,%r8
+ vpclmulqdq $0x00,%zmm4,%zmm3,%zmm0
+ vpclmulqdq $0x01,%zmm4,%zmm3,%zmm1
+ vpclmulqdq $0x10,%zmm4,%zmm3,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm0,%zmm5,%zmm2
+ vpshufd $0x4e,%zmm0,%zmm0
+ vpternlogd $0x96,%zmm2,%zmm0,%zmm1
+ vpclmulqdq $0x11,%zmm4,%zmm3,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm5,%zmm0
+ vpshufd $0x4e,%zmm1,%zmm1
+ vpternlogd $0x96,%zmm0,%zmm1,%zmm3
+
+ vmovdqu8 %zmm3,(%r8)
decl %eax
jnz .Lprecompute_next__func1
@@ -167,1048 +181,7 @@
ret
.cfi_endproc
-.size gcm_init_vpclmulqdq_avx10, . - gcm_init_vpclmulqdq_avx10
-.globl gcm_ghash_vpclmulqdq_avx10_256
-.hidden gcm_ghash_vpclmulqdq_avx10_256
-.type gcm_ghash_vpclmulqdq_avx10_256,@function
-.align 32
-gcm_ghash_vpclmulqdq_avx10_256:
-.cfi_startproc
-
-_CET_ENDBR
-
-
-
-
-
-
- vmovdqu .Lbswap_mask(%rip),%xmm4
- vmovdqu .Lgfpoly(%rip),%xmm10
-
-
- vmovdqu (%rdi),%xmm5
- vpshufb %xmm4,%xmm5,%xmm5
-
-
- cmpq $32,%rcx
- jb .Laad_blockbyblock__func1
-
-
-
- vshufi64x2 $0,%ymm4,%ymm4,%ymm4
- vshufi64x2 $0,%ymm10,%ymm10,%ymm10
-
-
- vmovdqu8 256-32(%rsi),%ymm9
-
- cmpq $128-1,%rcx
- jbe .Laad_loop_1x__func1
-
-
- vmovdqu8 256-128(%rsi),%ymm6
- vmovdqu8 256-96(%rsi),%ymm7
- vmovdqu8 256-64(%rsi),%ymm8
-
-
-.Laad_loop_4x__func1:
- vmovdqu8 0(%rdx),%ymm0
- vmovdqu8 32(%rdx),%ymm1
- vmovdqu8 64(%rdx),%ymm2
- vmovdqu8 96(%rdx),%ymm3
- vpshufb %ymm4,%ymm0,%ymm0
- vpxord %ymm5,%ymm0,%ymm0
- vpshufb %ymm4,%ymm1,%ymm1
- vpshufb %ymm4,%ymm2,%ymm2
- vpshufb %ymm4,%ymm3,%ymm3
- vpclmulqdq $0x00,%ymm6,%ymm0,%ymm5
- vpclmulqdq $0x00,%ymm7,%ymm1,%ymm11
- vpclmulqdq $0x00,%ymm8,%ymm2,%ymm12
- vpxord %ymm11,%ymm5,%ymm5
- vpclmulqdq $0x00,%ymm9,%ymm3,%ymm13
- vpternlogd $0x96,%ymm13,%ymm12,%ymm5
- vpclmulqdq $0x01,%ymm6,%ymm0,%ymm11
- vpclmulqdq $0x01,%ymm7,%ymm1,%ymm12
- vpclmulqdq $0x01,%ymm8,%ymm2,%ymm13
- vpternlogd $0x96,%ymm13,%ymm12,%ymm11
- vpclmulqdq $0x01,%ymm9,%ymm3,%ymm12
- vpclmulqdq $0x10,%ymm6,%ymm0,%ymm13
- vpternlogd $0x96,%ymm13,%ymm12,%ymm11
- vpclmulqdq $0x10,%ymm7,%ymm1,%ymm12
- vpclmulqdq $0x10,%ymm8,%ymm2,%ymm13
- vpternlogd $0x96,%ymm13,%ymm12,%ymm11
- vpclmulqdq $0x01,%ymm5,%ymm10,%ymm13
- vpclmulqdq $0x10,%ymm9,%ymm3,%ymm12
- vpxord %ymm12,%ymm11,%ymm11
- vpshufd $0x4e,%ymm5,%ymm5
- vpclmulqdq $0x11,%ymm6,%ymm0,%ymm0
- vpclmulqdq $0x11,%ymm7,%ymm1,%ymm1
- vpclmulqdq $0x11,%ymm8,%ymm2,%ymm2
- vpternlogd $0x96,%ymm13,%ymm5,%ymm11
- vpclmulqdq $0x11,%ymm9,%ymm3,%ymm3
- vpternlogd $0x96,%ymm2,%ymm1,%ymm0
- vpclmulqdq $0x01,%ymm11,%ymm10,%ymm12
- vpxord %ymm3,%ymm0,%ymm5
- vpshufd $0x4e,%ymm11,%ymm11
- vpternlogd $0x96,%ymm12,%ymm11,%ymm5
- vextracti32x4 $1,%ymm5,%xmm0
- vpxord %xmm0,%xmm5,%xmm5
-
- subq $-128,%rdx
- addq $-128,%rcx
- cmpq $128-1,%rcx
- ja .Laad_loop_4x__func1
-
-
- cmpq $32,%rcx
- jb .Laad_large_done__func1
-.Laad_loop_1x__func1:
- vmovdqu8 (%rdx),%ymm0
- vpshufb %ymm4,%ymm0,%ymm0
- vpxord %ymm0,%ymm5,%ymm5
- vpclmulqdq $0x00,%ymm9,%ymm5,%ymm0
- vpclmulqdq $0x01,%ymm9,%ymm5,%ymm1
- vpclmulqdq $0x10,%ymm9,%ymm5,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vpclmulqdq $0x01,%ymm0,%ymm10,%ymm2
- vpshufd $0x4e,%ymm0,%ymm0
- vpternlogd $0x96,%ymm2,%ymm0,%ymm1
- vpclmulqdq $0x11,%ymm9,%ymm5,%ymm5
- vpclmulqdq $0x01,%ymm1,%ymm10,%ymm0
- vpshufd $0x4e,%ymm1,%ymm1
- vpternlogd $0x96,%ymm0,%ymm1,%ymm5
-
- vextracti32x4 $1,%ymm5,%xmm0
- vpxord %xmm0,%xmm5,%xmm5
-
- addq $32,%rdx
- subq $32,%rcx
- cmpq $32,%rcx
- jae .Laad_loop_1x__func1
-
-.Laad_large_done__func1:
-
-
- vzeroupper
-
-
-.Laad_blockbyblock__func1:
- testq %rcx,%rcx
- jz .Laad_done__func1
- vmovdqu 256-16(%rsi),%xmm9
-.Laad_loop_blockbyblock__func1:
- vmovdqu (%rdx),%xmm0
- vpshufb %xmm4,%xmm0,%xmm0
- vpxor %xmm0,%xmm5,%xmm5
- vpclmulqdq $0x00,%xmm9,%xmm5,%xmm0
- vpclmulqdq $0x01,%xmm9,%xmm5,%xmm1
- vpclmulqdq $0x10,%xmm9,%xmm5,%xmm2
- vpxord %xmm2,%xmm1,%xmm1
- vpclmulqdq $0x01,%xmm0,%xmm10,%xmm2
- vpshufd $0x4e,%xmm0,%xmm0
- vpternlogd $0x96,%xmm2,%xmm0,%xmm1
- vpclmulqdq $0x11,%xmm9,%xmm5,%xmm5
- vpclmulqdq $0x01,%xmm1,%xmm10,%xmm0
- vpshufd $0x4e,%xmm1,%xmm1
- vpternlogd $0x96,%xmm0,%xmm1,%xmm5
-
- addq $16,%rdx
- subq $16,%rcx
- jnz .Laad_loop_blockbyblock__func1
-
-.Laad_done__func1:
-
- vpshufb %xmm4,%xmm5,%xmm5
- vmovdqu %xmm5,(%rdi)
- ret
-
-.cfi_endproc
-.size gcm_ghash_vpclmulqdq_avx10_256, . - gcm_ghash_vpclmulqdq_avx10_256
-.globl aes_gcm_enc_update_vaes_avx10_256
-.hidden aes_gcm_enc_update_vaes_avx10_256
-.type aes_gcm_enc_update_vaes_avx10_256,@function
-.align 32
-aes_gcm_enc_update_vaes_avx10_256:
-.cfi_startproc
-
-_CET_ENDBR
- pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-16
-
- movq 16(%rsp),%r12
-#ifdef BORINGSSL_DISPATCH_TEST
-.extern BORINGSSL_function_hit
-.hidden BORINGSSL_function_hit
- movb $1,BORINGSSL_function_hit+6(%rip)
-#endif
-
- vbroadcasti32x4 .Lbswap_mask(%rip),%ymm8
- vbroadcasti32x4 .Lgfpoly(%rip),%ymm31
-
-
-
- vmovdqu (%r12),%xmm10
- vpshufb %xmm8,%xmm10,%xmm10
- vbroadcasti32x4 (%r8),%ymm12
- vpshufb %ymm8,%ymm12,%ymm12
-
-
-
- movl 240(%rcx),%r10d
- leal -20(,%r10,4),%r10d
-
-
-
-
- leaq 96(%rcx,%r10,4),%r11
- vbroadcasti32x4 (%rcx),%ymm13
- vbroadcasti32x4 (%r11),%ymm14
-
-
- vpaddd .Lctr_pattern(%rip),%ymm12,%ymm12
-
-
- vbroadcasti32x4 .Linc_2blocks(%rip),%ymm11
-
-
-
- cmpq $128-1,%rdx
- jbe .Lcrypt_loop_4x_done__func1
-
-
- vmovdqu8 256-128(%r9),%ymm27
- vmovdqu8 256-96(%r9),%ymm28
- vmovdqu8 256-64(%r9),%ymm29
- vmovdqu8 256-32(%r9),%ymm30
-
-
-
-
- vpshufb %ymm8,%ymm12,%ymm0
- vpaddd %ymm11,%ymm12,%ymm12
- vpshufb %ymm8,%ymm12,%ymm1
- vpaddd %ymm11,%ymm12,%ymm12
- vpshufb %ymm8,%ymm12,%ymm2
- vpaddd %ymm11,%ymm12,%ymm12
- vpshufb %ymm8,%ymm12,%ymm3
- vpaddd %ymm11,%ymm12,%ymm12
-
-
- vpxord %ymm13,%ymm0,%ymm0
- vpxord %ymm13,%ymm1,%ymm1
- vpxord %ymm13,%ymm2,%ymm2
- vpxord %ymm13,%ymm3,%ymm3
-
- leaq 16(%rcx),%rax
-.Lvaesenc_loop_first_4_vecs__func1:
- vbroadcasti32x4 (%rax),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- vaesenc %ymm9,%ymm1,%ymm1
- vaesenc %ymm9,%ymm2,%ymm2
- vaesenc %ymm9,%ymm3,%ymm3
-
- addq $16,%rax
- cmpq %rax,%r11
- jne .Lvaesenc_loop_first_4_vecs__func1
-
-
-
- vpxord 0(%rdi),%ymm14,%ymm4
- vpxord 32(%rdi),%ymm14,%ymm5
- vpxord 64(%rdi),%ymm14,%ymm6
- vpxord 96(%rdi),%ymm14,%ymm7
-
-
-
- vaesenclast %ymm4,%ymm0,%ymm4
- vaesenclast %ymm5,%ymm1,%ymm5
- vaesenclast %ymm6,%ymm2,%ymm6
- vaesenclast %ymm7,%ymm3,%ymm7
-
-
- vmovdqu8 %ymm4,0(%rsi)
- vmovdqu8 %ymm5,32(%rsi)
- vmovdqu8 %ymm6,64(%rsi)
- vmovdqu8 %ymm7,96(%rsi)
-
- subq $-128,%rdi
- subq $-128,%rsi
- addq $-128,%rdx
- cmpq $128-1,%rdx
- jbe .Lghash_last_ciphertext_4x__func1
- vbroadcasti32x4 -144(%r11),%ymm15
- vbroadcasti32x4 -128(%r11),%ymm16
- vbroadcasti32x4 -112(%r11),%ymm17
- vbroadcasti32x4 -96(%r11),%ymm18
- vbroadcasti32x4 -80(%r11),%ymm19
- vbroadcasti32x4 -64(%r11),%ymm20
- vbroadcasti32x4 -48(%r11),%ymm21
- vbroadcasti32x4 -32(%r11),%ymm22
- vbroadcasti32x4 -16(%r11),%ymm23
-.Lcrypt_loop_4x__func1:
-
-
-
- vpshufb %ymm8,%ymm12,%ymm0
- vpaddd %ymm11,%ymm12,%ymm12
- vpshufb %ymm8,%ymm12,%ymm1
- vpaddd %ymm11,%ymm12,%ymm12
- vpshufb %ymm8,%ymm12,%ymm2
- vpaddd %ymm11,%ymm12,%ymm12
- vpshufb %ymm8,%ymm12,%ymm3
- vpaddd %ymm11,%ymm12,%ymm12
-
-
- vpxord %ymm13,%ymm0,%ymm0
- vpxord %ymm13,%ymm1,%ymm1
- vpxord %ymm13,%ymm2,%ymm2
- vpxord %ymm13,%ymm3,%ymm3
-
- cmpl $24,%r10d
- jl .Laes128__func1
- je .Laes192__func1
-
- vbroadcasti32x4 -208(%r11),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- vaesenc %ymm9,%ymm1,%ymm1
- vaesenc %ymm9,%ymm2,%ymm2
- vaesenc %ymm9,%ymm3,%ymm3
-
- vbroadcasti32x4 -192(%r11),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- vaesenc %ymm9,%ymm1,%ymm1
- vaesenc %ymm9,%ymm2,%ymm2
- vaesenc %ymm9,%ymm3,%ymm3
-
-.Laes192__func1:
- vbroadcasti32x4 -176(%r11),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- vaesenc %ymm9,%ymm1,%ymm1
- vaesenc %ymm9,%ymm2,%ymm2
- vaesenc %ymm9,%ymm3,%ymm3
-
- vbroadcasti32x4 -160(%r11),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- vaesenc %ymm9,%ymm1,%ymm1
- vaesenc %ymm9,%ymm2,%ymm2
- vaesenc %ymm9,%ymm3,%ymm3
-
-.Laes128__func1:
- vpshufb %ymm8,%ymm4,%ymm4
- vpxord %ymm10,%ymm4,%ymm4
- vpshufb %ymm8,%ymm5,%ymm5
- vpshufb %ymm8,%ymm6,%ymm6
-
- vaesenc %ymm15,%ymm0,%ymm0
- vaesenc %ymm15,%ymm1,%ymm1
- vaesenc %ymm15,%ymm2,%ymm2
- vaesenc %ymm15,%ymm3,%ymm3
-
- vpshufb %ymm8,%ymm7,%ymm7
- vpclmulqdq $0x00,%ymm27,%ymm4,%ymm10
- vpclmulqdq $0x00,%ymm28,%ymm5,%ymm24
- vpclmulqdq $0x00,%ymm29,%ymm6,%ymm25
-
- vaesenc %ymm16,%ymm0,%ymm0
- vaesenc %ymm16,%ymm1,%ymm1
- vaesenc %ymm16,%ymm2,%ymm2
- vaesenc %ymm16,%ymm3,%ymm3
-
- vpxord %ymm24,%ymm10,%ymm10
- vpclmulqdq $0x00,%ymm30,%ymm7,%ymm26
- vpternlogd $0x96,%ymm26,%ymm25,%ymm10
- vpclmulqdq $0x01,%ymm27,%ymm4,%ymm24
-
- vaesenc %ymm17,%ymm0,%ymm0
- vaesenc %ymm17,%ymm1,%ymm1
- vaesenc %ymm17,%ymm2,%ymm2
- vaesenc %ymm17,%ymm3,%ymm3
-
- vpclmulqdq $0x01,%ymm28,%ymm5,%ymm25
- vpclmulqdq $0x01,%ymm29,%ymm6,%ymm26
- vpternlogd $0x96,%ymm26,%ymm25,%ymm24
- vpclmulqdq $0x01,%ymm30,%ymm7,%ymm25
-
- vaesenc %ymm18,%ymm0,%ymm0
- vaesenc %ymm18,%ymm1,%ymm1
- vaesenc %ymm18,%ymm2,%ymm2
- vaesenc %ymm18,%ymm3,%ymm3
-
- vpclmulqdq $0x10,%ymm27,%ymm4,%ymm26
- vpternlogd $0x96,%ymm26,%ymm25,%ymm24
- vpclmulqdq $0x10,%ymm28,%ymm5,%ymm25
- vpclmulqdq $0x10,%ymm29,%ymm6,%ymm26
-
- vaesenc %ymm19,%ymm0,%ymm0
- vaesenc %ymm19,%ymm1,%ymm1
- vaesenc %ymm19,%ymm2,%ymm2
- vaesenc %ymm19,%ymm3,%ymm3
-
- vpternlogd $0x96,%ymm26,%ymm25,%ymm24
- vpclmulqdq $0x01,%ymm10,%ymm31,%ymm26
- vpclmulqdq $0x10,%ymm30,%ymm7,%ymm25
- vpxord %ymm25,%ymm24,%ymm24
-
- vaesenc %ymm20,%ymm0,%ymm0
- vaesenc %ymm20,%ymm1,%ymm1
- vaesenc %ymm20,%ymm2,%ymm2
- vaesenc %ymm20,%ymm3,%ymm3
-
- vpshufd $0x4e,%ymm10,%ymm10
- vpclmulqdq $0x11,%ymm27,%ymm4,%ymm4
- vpclmulqdq $0x11,%ymm28,%ymm5,%ymm5
- vpclmulqdq $0x11,%ymm29,%ymm6,%ymm6
-
- vaesenc %ymm21,%ymm0,%ymm0
- vaesenc %ymm21,%ymm1,%ymm1
- vaesenc %ymm21,%ymm2,%ymm2
- vaesenc %ymm21,%ymm3,%ymm3
-
- vpternlogd $0x96,%ymm26,%ymm10,%ymm24
- vpclmulqdq $0x11,%ymm30,%ymm7,%ymm7
- vpternlogd $0x96,%ymm6,%ymm5,%ymm4
- vpclmulqdq $0x01,%ymm24,%ymm31,%ymm25
-
- vaesenc %ymm22,%ymm0,%ymm0
- vaesenc %ymm22,%ymm1,%ymm1
- vaesenc %ymm22,%ymm2,%ymm2
- vaesenc %ymm22,%ymm3,%ymm3
-
- vpxord %ymm7,%ymm4,%ymm10
- vpshufd $0x4e,%ymm24,%ymm24
- vpternlogd $0x96,%ymm25,%ymm24,%ymm10
-
- vaesenc %ymm23,%ymm0,%ymm0
- vaesenc %ymm23,%ymm1,%ymm1
- vaesenc %ymm23,%ymm2,%ymm2
- vaesenc %ymm23,%ymm3,%ymm3
-
- vextracti32x4 $1,%ymm10,%xmm4
- vpxord %xmm4,%xmm10,%xmm10
-
-
-
-
- vpxord 0(%rdi),%ymm14,%ymm4
- vpxord 32(%rdi),%ymm14,%ymm5
- vpxord 64(%rdi),%ymm14,%ymm6
- vpxord 96(%rdi),%ymm14,%ymm7
-
-
-
- vaesenclast %ymm4,%ymm0,%ymm4
- vaesenclast %ymm5,%ymm1,%ymm5
- vaesenclast %ymm6,%ymm2,%ymm6
- vaesenclast %ymm7,%ymm3,%ymm7
-
-
- vmovdqu8 %ymm4,0(%rsi)
- vmovdqu8 %ymm5,32(%rsi)
- vmovdqu8 %ymm6,64(%rsi)
- vmovdqu8 %ymm7,96(%rsi)
-
- subq $-128,%rdi
- subq $-128,%rsi
- addq $-128,%rdx
- cmpq $128-1,%rdx
- ja .Lcrypt_loop_4x__func1
-.Lghash_last_ciphertext_4x__func1:
- vpshufb %ymm8,%ymm4,%ymm4
- vpxord %ymm10,%ymm4,%ymm4
- vpshufb %ymm8,%ymm5,%ymm5
- vpshufb %ymm8,%ymm6,%ymm6
- vpshufb %ymm8,%ymm7,%ymm7
- vpclmulqdq $0x00,%ymm27,%ymm4,%ymm10
- vpclmulqdq $0x00,%ymm28,%ymm5,%ymm24
- vpclmulqdq $0x00,%ymm29,%ymm6,%ymm25
- vpxord %ymm24,%ymm10,%ymm10
- vpclmulqdq $0x00,%ymm30,%ymm7,%ymm26
- vpternlogd $0x96,%ymm26,%ymm25,%ymm10
- vpclmulqdq $0x01,%ymm27,%ymm4,%ymm24
- vpclmulqdq $0x01,%ymm28,%ymm5,%ymm25
- vpclmulqdq $0x01,%ymm29,%ymm6,%ymm26
- vpternlogd $0x96,%ymm26,%ymm25,%ymm24
- vpclmulqdq $0x01,%ymm30,%ymm7,%ymm25
- vpclmulqdq $0x10,%ymm27,%ymm4,%ymm26
- vpternlogd $0x96,%ymm26,%ymm25,%ymm24
- vpclmulqdq $0x10,%ymm28,%ymm5,%ymm25
- vpclmulqdq $0x10,%ymm29,%ymm6,%ymm26
- vpternlogd $0x96,%ymm26,%ymm25,%ymm24
- vpclmulqdq $0x01,%ymm10,%ymm31,%ymm26
- vpclmulqdq $0x10,%ymm30,%ymm7,%ymm25
- vpxord %ymm25,%ymm24,%ymm24
- vpshufd $0x4e,%ymm10,%ymm10
- vpclmulqdq $0x11,%ymm27,%ymm4,%ymm4
- vpclmulqdq $0x11,%ymm28,%ymm5,%ymm5
- vpclmulqdq $0x11,%ymm29,%ymm6,%ymm6
- vpternlogd $0x96,%ymm26,%ymm10,%ymm24
- vpclmulqdq $0x11,%ymm30,%ymm7,%ymm7
- vpternlogd $0x96,%ymm6,%ymm5,%ymm4
- vpclmulqdq $0x01,%ymm24,%ymm31,%ymm25
- vpxord %ymm7,%ymm4,%ymm10
- vpshufd $0x4e,%ymm24,%ymm24
- vpternlogd $0x96,%ymm25,%ymm24,%ymm10
- vextracti32x4 $1,%ymm10,%xmm4
- vpxord %xmm4,%xmm10,%xmm10
-
-.Lcrypt_loop_4x_done__func1:
-
- testq %rdx,%rdx
- jz .Ldone__func1
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- movq %rdx,%rax
- negq %rax
- andq $-16,%rax
- leaq 256(%r9,%rax,1),%r8
- vpxor %xmm4,%xmm4,%xmm4
- vpxor %xmm5,%xmm5,%xmm5
- vpxor %xmm6,%xmm6,%xmm6
-
- cmpq $32,%rdx
- jb .Lpartial_vec__func1
-
-.Lcrypt_loop_1x__func1:
-
-
-
- vpshufb %ymm8,%ymm12,%ymm0
- vpaddd %ymm11,%ymm12,%ymm12
- vpxord %ymm13,%ymm0,%ymm0
- leaq 16(%rcx),%rax
-.Lvaesenc_loop_tail_full_vec__func1:
- vbroadcasti32x4 (%rax),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- addq $16,%rax
- cmpq %rax,%r11
- jne .Lvaesenc_loop_tail_full_vec__func1
- vaesenclast %ymm14,%ymm0,%ymm0
-
-
- vmovdqu8 (%rdi),%ymm1
- vpxord %ymm1,%ymm0,%ymm0
- vmovdqu8 %ymm0,(%rsi)
-
-
- vmovdqu8 (%r8),%ymm30
- vpshufb %ymm8,%ymm0,%ymm0
- vpxord %ymm10,%ymm0,%ymm0
- vpclmulqdq $0x00,%ymm30,%ymm0,%ymm7
- vpclmulqdq $0x01,%ymm30,%ymm0,%ymm1
- vpclmulqdq $0x10,%ymm30,%ymm0,%ymm2
- vpclmulqdq $0x11,%ymm30,%ymm0,%ymm3
- vpxord %ymm7,%ymm4,%ymm4
- vpternlogd $0x96,%ymm2,%ymm1,%ymm5
- vpxord %ymm3,%ymm6,%ymm6
-
- vpxor %xmm10,%xmm10,%xmm10
-
- addq $32,%r8
- addq $32,%rdi
- addq $32,%rsi
- subq $32,%rdx
- cmpq $32,%rdx
- jae .Lcrypt_loop_1x__func1
-
- testq %rdx,%rdx
- jz .Lreduce__func1
-
-.Lpartial_vec__func1:
-
-
-
-
- movq $-1,%rax
- bzhiq %rdx,%rax,%rax
- kmovd %eax,%k1
- addq $15,%rdx
- andq $-16,%rdx
- movq $-1,%rax
- bzhiq %rdx,%rax,%rax
- kmovd %eax,%k2
-
-
-
- vpshufb %ymm8,%ymm12,%ymm0
- vpxord %ymm13,%ymm0,%ymm0
- leaq 16(%rcx),%rax
-.Lvaesenc_loop_tail_partialvec__func1:
- vbroadcasti32x4 (%rax),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- addq $16,%rax
- cmpq %rax,%r11
- jne .Lvaesenc_loop_tail_partialvec__func1
- vaesenclast %ymm14,%ymm0,%ymm0
-
-
- vmovdqu8 (%rdi),%ymm1{%k1}{z}
- vpxord %ymm1,%ymm0,%ymm0
- vmovdqu8 %ymm0,(%rsi){%k1}
-
-
-
-
-
-
-
-
-
-
-
-
-
- vmovdqu8 (%r8),%ymm30{%k2}{z}
- vmovdqu8 %ymm0,%ymm1{%k1}{z}
- vpshufb %ymm8,%ymm1,%ymm0
- vpxord %ymm10,%ymm0,%ymm0
- vpclmulqdq $0x00,%ymm30,%ymm0,%ymm7
- vpclmulqdq $0x01,%ymm30,%ymm0,%ymm1
- vpclmulqdq $0x10,%ymm30,%ymm0,%ymm2
- vpclmulqdq $0x11,%ymm30,%ymm0,%ymm3
- vpxord %ymm7,%ymm4,%ymm4
- vpternlogd $0x96,%ymm2,%ymm1,%ymm5
- vpxord %ymm3,%ymm6,%ymm6
-
-
-.Lreduce__func1:
-
- vpclmulqdq $0x01,%ymm4,%ymm31,%ymm0
- vpshufd $0x4e,%ymm4,%ymm4
- vpternlogd $0x96,%ymm0,%ymm4,%ymm5
- vpclmulqdq $0x01,%ymm5,%ymm31,%ymm0
- vpshufd $0x4e,%ymm5,%ymm5
- vpternlogd $0x96,%ymm0,%ymm5,%ymm6
-
- vextracti32x4 $1,%ymm6,%xmm0
- vpxord %xmm0,%xmm6,%xmm10
-
-
-.Ldone__func1:
-
- vpshufb %xmm8,%xmm10,%xmm10
- vmovdqu %xmm10,(%r12)
-
- vzeroupper
- popq %r12
-.cfi_adjust_cfa_offset -8
-.cfi_restore %r12
- ret
-
-.cfi_endproc
-.size aes_gcm_enc_update_vaes_avx10_256, . - aes_gcm_enc_update_vaes_avx10_256
-.globl aes_gcm_dec_update_vaes_avx10_256
-.hidden aes_gcm_dec_update_vaes_avx10_256
-.type aes_gcm_dec_update_vaes_avx10_256,@function
-.align 32
-aes_gcm_dec_update_vaes_avx10_256:
-.cfi_startproc
-
-_CET_ENDBR
- pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-16
-
- movq 16(%rsp),%r12
-
- vbroadcasti32x4 .Lbswap_mask(%rip),%ymm8
- vbroadcasti32x4 .Lgfpoly(%rip),%ymm31
-
-
-
- vmovdqu (%r12),%xmm10
- vpshufb %xmm8,%xmm10,%xmm10
- vbroadcasti32x4 (%r8),%ymm12
- vpshufb %ymm8,%ymm12,%ymm12
-
-
-
- movl 240(%rcx),%r10d
- leal -20(,%r10,4),%r10d
-
-
-
-
- leaq 96(%rcx,%r10,4),%r11
- vbroadcasti32x4 (%rcx),%ymm13
- vbroadcasti32x4 (%r11),%ymm14
-
-
- vpaddd .Lctr_pattern(%rip),%ymm12,%ymm12
-
-
- vbroadcasti32x4 .Linc_2blocks(%rip),%ymm11
-
-
-
- cmpq $128-1,%rdx
- jbe .Lcrypt_loop_4x_done__func2
-
-
- vmovdqu8 256-128(%r9),%ymm27
- vmovdqu8 256-96(%r9),%ymm28
- vmovdqu8 256-64(%r9),%ymm29
- vmovdqu8 256-32(%r9),%ymm30
- vbroadcasti32x4 -144(%r11),%ymm15
- vbroadcasti32x4 -128(%r11),%ymm16
- vbroadcasti32x4 -112(%r11),%ymm17
- vbroadcasti32x4 -96(%r11),%ymm18
- vbroadcasti32x4 -80(%r11),%ymm19
- vbroadcasti32x4 -64(%r11),%ymm20
- vbroadcasti32x4 -48(%r11),%ymm21
- vbroadcasti32x4 -32(%r11),%ymm22
- vbroadcasti32x4 -16(%r11),%ymm23
-.Lcrypt_loop_4x__func2:
- vmovdqu8 0(%rdi),%ymm4
- vmovdqu8 32(%rdi),%ymm5
- vmovdqu8 64(%rdi),%ymm6
- vmovdqu8 96(%rdi),%ymm7
-
-
-
- vpshufb %ymm8,%ymm12,%ymm0
- vpaddd %ymm11,%ymm12,%ymm12
- vpshufb %ymm8,%ymm12,%ymm1
- vpaddd %ymm11,%ymm12,%ymm12
- vpshufb %ymm8,%ymm12,%ymm2
- vpaddd %ymm11,%ymm12,%ymm12
- vpshufb %ymm8,%ymm12,%ymm3
- vpaddd %ymm11,%ymm12,%ymm12
-
-
- vpxord %ymm13,%ymm0,%ymm0
- vpxord %ymm13,%ymm1,%ymm1
- vpxord %ymm13,%ymm2,%ymm2
- vpxord %ymm13,%ymm3,%ymm3
-
- cmpl $24,%r10d
- jl .Laes128__func2
- je .Laes192__func2
-
- vbroadcasti32x4 -208(%r11),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- vaesenc %ymm9,%ymm1,%ymm1
- vaesenc %ymm9,%ymm2,%ymm2
- vaesenc %ymm9,%ymm3,%ymm3
-
- vbroadcasti32x4 -192(%r11),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- vaesenc %ymm9,%ymm1,%ymm1
- vaesenc %ymm9,%ymm2,%ymm2
- vaesenc %ymm9,%ymm3,%ymm3
-
-.Laes192__func2:
- vbroadcasti32x4 -176(%r11),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- vaesenc %ymm9,%ymm1,%ymm1
- vaesenc %ymm9,%ymm2,%ymm2
- vaesenc %ymm9,%ymm3,%ymm3
-
- vbroadcasti32x4 -160(%r11),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- vaesenc %ymm9,%ymm1,%ymm1
- vaesenc %ymm9,%ymm2,%ymm2
- vaesenc %ymm9,%ymm3,%ymm3
-
-.Laes128__func2:
- vpshufb %ymm8,%ymm4,%ymm4
- vpxord %ymm10,%ymm4,%ymm4
- vpshufb %ymm8,%ymm5,%ymm5
- vpshufb %ymm8,%ymm6,%ymm6
-
- vaesenc %ymm15,%ymm0,%ymm0
- vaesenc %ymm15,%ymm1,%ymm1
- vaesenc %ymm15,%ymm2,%ymm2
- vaesenc %ymm15,%ymm3,%ymm3
-
- vpshufb %ymm8,%ymm7,%ymm7
- vpclmulqdq $0x00,%ymm27,%ymm4,%ymm10
- vpclmulqdq $0x00,%ymm28,%ymm5,%ymm24
- vpclmulqdq $0x00,%ymm29,%ymm6,%ymm25
-
- vaesenc %ymm16,%ymm0,%ymm0
- vaesenc %ymm16,%ymm1,%ymm1
- vaesenc %ymm16,%ymm2,%ymm2
- vaesenc %ymm16,%ymm3,%ymm3
-
- vpxord %ymm24,%ymm10,%ymm10
- vpclmulqdq $0x00,%ymm30,%ymm7,%ymm26
- vpternlogd $0x96,%ymm26,%ymm25,%ymm10
- vpclmulqdq $0x01,%ymm27,%ymm4,%ymm24
-
- vaesenc %ymm17,%ymm0,%ymm0
- vaesenc %ymm17,%ymm1,%ymm1
- vaesenc %ymm17,%ymm2,%ymm2
- vaesenc %ymm17,%ymm3,%ymm3
-
- vpclmulqdq $0x01,%ymm28,%ymm5,%ymm25
- vpclmulqdq $0x01,%ymm29,%ymm6,%ymm26
- vpternlogd $0x96,%ymm26,%ymm25,%ymm24
- vpclmulqdq $0x01,%ymm30,%ymm7,%ymm25
-
- vaesenc %ymm18,%ymm0,%ymm0
- vaesenc %ymm18,%ymm1,%ymm1
- vaesenc %ymm18,%ymm2,%ymm2
- vaesenc %ymm18,%ymm3,%ymm3
-
- vpclmulqdq $0x10,%ymm27,%ymm4,%ymm26
- vpternlogd $0x96,%ymm26,%ymm25,%ymm24
- vpclmulqdq $0x10,%ymm28,%ymm5,%ymm25
- vpclmulqdq $0x10,%ymm29,%ymm6,%ymm26
-
- vaesenc %ymm19,%ymm0,%ymm0
- vaesenc %ymm19,%ymm1,%ymm1
- vaesenc %ymm19,%ymm2,%ymm2
- vaesenc %ymm19,%ymm3,%ymm3
-
- vpternlogd $0x96,%ymm26,%ymm25,%ymm24
- vpclmulqdq $0x01,%ymm10,%ymm31,%ymm26
- vpclmulqdq $0x10,%ymm30,%ymm7,%ymm25
- vpxord %ymm25,%ymm24,%ymm24
-
- vaesenc %ymm20,%ymm0,%ymm0
- vaesenc %ymm20,%ymm1,%ymm1
- vaesenc %ymm20,%ymm2,%ymm2
- vaesenc %ymm20,%ymm3,%ymm3
-
- vpshufd $0x4e,%ymm10,%ymm10
- vpclmulqdq $0x11,%ymm27,%ymm4,%ymm4
- vpclmulqdq $0x11,%ymm28,%ymm5,%ymm5
- vpclmulqdq $0x11,%ymm29,%ymm6,%ymm6
-
- vaesenc %ymm21,%ymm0,%ymm0
- vaesenc %ymm21,%ymm1,%ymm1
- vaesenc %ymm21,%ymm2,%ymm2
- vaesenc %ymm21,%ymm3,%ymm3
-
- vpternlogd $0x96,%ymm26,%ymm10,%ymm24
- vpclmulqdq $0x11,%ymm30,%ymm7,%ymm7
- vpternlogd $0x96,%ymm6,%ymm5,%ymm4
- vpclmulqdq $0x01,%ymm24,%ymm31,%ymm25
-
- vaesenc %ymm22,%ymm0,%ymm0
- vaesenc %ymm22,%ymm1,%ymm1
- vaesenc %ymm22,%ymm2,%ymm2
- vaesenc %ymm22,%ymm3,%ymm3
-
- vpxord %ymm7,%ymm4,%ymm10
- vpshufd $0x4e,%ymm24,%ymm24
- vpternlogd $0x96,%ymm25,%ymm24,%ymm10
-
- vaesenc %ymm23,%ymm0,%ymm0
- vaesenc %ymm23,%ymm1,%ymm1
- vaesenc %ymm23,%ymm2,%ymm2
- vaesenc %ymm23,%ymm3,%ymm3
-
- vextracti32x4 $1,%ymm10,%xmm4
- vpxord %xmm4,%xmm10,%xmm10
-
-
-
-
- vpxord 0(%rdi),%ymm14,%ymm4
- vpxord 32(%rdi),%ymm14,%ymm5
- vpxord 64(%rdi),%ymm14,%ymm6
- vpxord 96(%rdi),%ymm14,%ymm7
-
-
-
- vaesenclast %ymm4,%ymm0,%ymm4
- vaesenclast %ymm5,%ymm1,%ymm5
- vaesenclast %ymm6,%ymm2,%ymm6
- vaesenclast %ymm7,%ymm3,%ymm7
-
-
- vmovdqu8 %ymm4,0(%rsi)
- vmovdqu8 %ymm5,32(%rsi)
- vmovdqu8 %ymm6,64(%rsi)
- vmovdqu8 %ymm7,96(%rsi)
-
- subq $-128,%rdi
- subq $-128,%rsi
- addq $-128,%rdx
- cmpq $128-1,%rdx
- ja .Lcrypt_loop_4x__func2
-.Lcrypt_loop_4x_done__func2:
-
- testq %rdx,%rdx
- jz .Ldone__func2
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- movq %rdx,%rax
- negq %rax
- andq $-16,%rax
- leaq 256(%r9,%rax,1),%r8
- vpxor %xmm4,%xmm4,%xmm4
- vpxor %xmm5,%xmm5,%xmm5
- vpxor %xmm6,%xmm6,%xmm6
-
- cmpq $32,%rdx
- jb .Lpartial_vec__func2
-
-.Lcrypt_loop_1x__func2:
-
-
-
- vpshufb %ymm8,%ymm12,%ymm0
- vpaddd %ymm11,%ymm12,%ymm12
- vpxord %ymm13,%ymm0,%ymm0
- leaq 16(%rcx),%rax
-.Lvaesenc_loop_tail_full_vec__func2:
- vbroadcasti32x4 (%rax),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- addq $16,%rax
- cmpq %rax,%r11
- jne .Lvaesenc_loop_tail_full_vec__func2
- vaesenclast %ymm14,%ymm0,%ymm0
-
-
- vmovdqu8 (%rdi),%ymm1
- vpxord %ymm1,%ymm0,%ymm0
- vmovdqu8 %ymm0,(%rsi)
-
-
- vmovdqu8 (%r8),%ymm30
- vpshufb %ymm8,%ymm1,%ymm0
- vpxord %ymm10,%ymm0,%ymm0
- vpclmulqdq $0x00,%ymm30,%ymm0,%ymm7
- vpclmulqdq $0x01,%ymm30,%ymm0,%ymm1
- vpclmulqdq $0x10,%ymm30,%ymm0,%ymm2
- vpclmulqdq $0x11,%ymm30,%ymm0,%ymm3
- vpxord %ymm7,%ymm4,%ymm4
- vpternlogd $0x96,%ymm2,%ymm1,%ymm5
- vpxord %ymm3,%ymm6,%ymm6
-
- vpxor %xmm10,%xmm10,%xmm10
-
- addq $32,%r8
- addq $32,%rdi
- addq $32,%rsi
- subq $32,%rdx
- cmpq $32,%rdx
- jae .Lcrypt_loop_1x__func2
-
- testq %rdx,%rdx
- jz .Lreduce__func2
-
-.Lpartial_vec__func2:
-
-
-
-
- movq $-1,%rax
- bzhiq %rdx,%rax,%rax
- kmovd %eax,%k1
- addq $15,%rdx
- andq $-16,%rdx
- movq $-1,%rax
- bzhiq %rdx,%rax,%rax
- kmovd %eax,%k2
-
-
-
- vpshufb %ymm8,%ymm12,%ymm0
- vpxord %ymm13,%ymm0,%ymm0
- leaq 16(%rcx),%rax
-.Lvaesenc_loop_tail_partialvec__func2:
- vbroadcasti32x4 (%rax),%ymm9
- vaesenc %ymm9,%ymm0,%ymm0
- addq $16,%rax
- cmpq %rax,%r11
- jne .Lvaesenc_loop_tail_partialvec__func2
- vaesenclast %ymm14,%ymm0,%ymm0
-
-
- vmovdqu8 (%rdi),%ymm1{%k1}{z}
- vpxord %ymm1,%ymm0,%ymm0
- vmovdqu8 %ymm0,(%rsi){%k1}
-
-
-
-
-
-
-
-
-
-
-
-
-
- vmovdqu8 (%r8),%ymm30{%k2}{z}
-
- vpshufb %ymm8,%ymm1,%ymm0
- vpxord %ymm10,%ymm0,%ymm0
- vpclmulqdq $0x00,%ymm30,%ymm0,%ymm7
- vpclmulqdq $0x01,%ymm30,%ymm0,%ymm1
- vpclmulqdq $0x10,%ymm30,%ymm0,%ymm2
- vpclmulqdq $0x11,%ymm30,%ymm0,%ymm3
- vpxord %ymm7,%ymm4,%ymm4
- vpternlogd $0x96,%ymm2,%ymm1,%ymm5
- vpxord %ymm3,%ymm6,%ymm6
-
-
-.Lreduce__func2:
-
- vpclmulqdq $0x01,%ymm4,%ymm31,%ymm0
- vpshufd $0x4e,%ymm4,%ymm4
- vpternlogd $0x96,%ymm0,%ymm4,%ymm5
- vpclmulqdq $0x01,%ymm5,%ymm31,%ymm0
- vpshufd $0x4e,%ymm5,%ymm5
- vpternlogd $0x96,%ymm0,%ymm5,%ymm6
-
- vextracti32x4 $1,%ymm6,%xmm0
- vpxord %xmm0,%xmm6,%xmm10
-
-
-.Ldone__func2:
-
- vpshufb %xmm8,%xmm10,%xmm10
- vmovdqu %xmm10,(%r12)
-
- vzeroupper
- popq %r12
-.cfi_adjust_cfa_offset -8
-.cfi_restore %r12
- ret
-
-.cfi_endproc
-.size aes_gcm_dec_update_vaes_avx10_256, . - aes_gcm_dec_update_vaes_avx10_256
+.size gcm_init_vpclmulqdq_avx10_512, . - gcm_init_vpclmulqdq_avx10_512
.globl gcm_ghash_vpclmulqdq_avx10_512
.hidden gcm_ghash_vpclmulqdq_avx10_512
.type gcm_ghash_vpclmulqdq_avx10_512,@function
@@ -1232,7 +205,7 @@
cmpq $64,%rcx
- jb .Laad_blockbyblock__func2
+ jb .Laad_blockbyblock__func1
@@ -1243,7 +216,7 @@
vmovdqu8 256-64(%rsi),%zmm9
cmpq $256-1,%rcx
- jbe .Laad_loop_1x__func2
+ jbe .Laad_loop_1x__func1
vmovdqu8 256-256(%rsi),%zmm6
@@ -1251,7 +224,7 @@
vmovdqu8 256-128(%rsi),%zmm8
-.Laad_loop_4x__func2:
+.Laad_loop_4x__func1:
vmovdqu8 0(%rdx),%zmm0
vmovdqu8 64(%rdx),%zmm1
vmovdqu8 128(%rdx),%zmm2
@@ -1300,12 +273,12 @@
subq $-256,%rdx
addq $-256,%rcx
cmpq $256-1,%rcx
- ja .Laad_loop_4x__func2
+ ja .Laad_loop_4x__func1
cmpq $64,%rcx
- jb .Laad_large_done__func2
-.Laad_loop_1x__func2:
+ jb .Laad_large_done__func1
+.Laad_loop_1x__func1:
vmovdqu8 (%rdx),%zmm0
vpshufb %zmm4,%zmm0,%zmm0
vpxord %zmm0,%zmm5,%zmm5
@@ -1330,19 +303,19 @@
addq $64,%rdx
subq $64,%rcx
cmpq $64,%rcx
- jae .Laad_loop_1x__func2
+ jae .Laad_loop_1x__func1
-.Laad_large_done__func2:
+.Laad_large_done__func1:
vzeroupper
-.Laad_blockbyblock__func2:
+.Laad_blockbyblock__func1:
testq %rcx,%rcx
- jz .Laad_done__func2
+ jz .Laad_done__func1
vmovdqu 256-16(%rsi),%xmm9
-.Laad_loop_blockbyblock__func2:
+.Laad_loop_blockbyblock__func1:
vmovdqu (%rdx),%xmm0
vpshufb %xmm4,%xmm0,%xmm0
vpxor %xmm0,%xmm5,%xmm5
@@ -1360,9 +333,9 @@
addq $16,%rdx
subq $16,%rcx
- jnz .Laad_loop_blockbyblock__func2
+ jnz .Laad_loop_blockbyblock__func1
-.Laad_done__func2:
+.Laad_done__func1:
vpshufb %xmm4,%xmm5,%xmm5
vmovdqu %xmm5,(%rdi)
@@ -1420,7 +393,7 @@
cmpq $256-1,%rdx
- jbe .Lcrypt_loop_4x_done__func3
+ jbe .Lcrypt_loop_4x_done__func1
vmovdqu8 256-256(%r9),%zmm27
@@ -1447,7 +420,7 @@
vpxord %zmm13,%zmm3,%zmm3
leaq 16(%rcx),%rax
-.Lvaesenc_loop_first_4_vecs__func3:
+.Lvaesenc_loop_first_4_vecs__func1:
vbroadcasti32x4 (%rax),%zmm9
vaesenc %zmm9,%zmm0,%zmm0
vaesenc %zmm9,%zmm1,%zmm1
@@ -1456,7 +429,7 @@
addq $16,%rax
cmpq %rax,%r11
- jne .Lvaesenc_loop_first_4_vecs__func3
+ jne .Lvaesenc_loop_first_4_vecs__func1
@@ -1482,7 +455,7 @@
subq $-256,%rsi
addq $-256,%rdx
cmpq $256-1,%rdx
- jbe .Lghash_last_ciphertext_4x__func3
+ jbe .Lghash_last_ciphertext_4x__func1
vbroadcasti32x4 -144(%r11),%zmm15
vbroadcasti32x4 -128(%r11),%zmm16
vbroadcasti32x4 -112(%r11),%zmm17
@@ -1492,7 +465,7 @@
vbroadcasti32x4 -48(%r11),%zmm21
vbroadcasti32x4 -32(%r11),%zmm22
vbroadcasti32x4 -16(%r11),%zmm23
-.Lcrypt_loop_4x__func3:
+.Lcrypt_loop_4x__func1:
@@ -1512,8 +485,8 @@
vpxord %zmm13,%zmm3,%zmm3
cmpl $24,%r10d
- jl .Laes128__func3
- je .Laes192__func3
+ jl .Laes128__func1
+ je .Laes192__func1
vbroadcasti32x4 -208(%r11),%zmm9
vaesenc %zmm9,%zmm0,%zmm0
@@ -1527,7 +500,7 @@
vaesenc %zmm9,%zmm2,%zmm2
vaesenc %zmm9,%zmm3,%zmm3
-.Laes192__func3:
+.Laes192__func1:
vbroadcasti32x4 -176(%r11),%zmm9
vaesenc %zmm9,%zmm0,%zmm0
vaesenc %zmm9,%zmm1,%zmm1
@@ -1540,7 +513,7 @@
vaesenc %zmm9,%zmm2,%zmm2
vaesenc %zmm9,%zmm3,%zmm3
-.Laes128__func3:
+.Laes128__func1:
vpshufb %zmm8,%zmm4,%zmm4
vpxord %zmm10,%zmm4,%zmm4
vpshufb %zmm8,%zmm5,%zmm5
@@ -1661,8 +634,8 @@
subq $-256,%rsi
addq $-256,%rdx
cmpq $256-1,%rdx
- ja .Lcrypt_loop_4x__func3
-.Lghash_last_ciphertext_4x__func3:
+ ja .Lcrypt_loop_4x__func1
+.Lghash_last_ciphertext_4x__func1:
vpshufb %zmm8,%zmm4,%zmm4
vpxord %zmm10,%zmm4,%zmm4
vpshufb %zmm8,%zmm5,%zmm5
@@ -1704,10 +677,10 @@
vpxord %xmm4,%xmm10,%xmm10
vpternlogd $0x96,%xmm5,%xmm6,%xmm10
-.Lcrypt_loop_4x_done__func3:
+.Lcrypt_loop_4x_done__func1:
testq %rdx,%rdx
- jz .Ldone__func3
+ jz .Ldone__func1
@@ -1737,9 +710,9 @@
vpxor %xmm6,%xmm6,%xmm6
cmpq $64,%rdx
- jb .Lpartial_vec__func3
+ jb .Lpartial_vec__func1
-.Lcrypt_loop_1x__func3:
+.Lcrypt_loop_1x__func1:
@@ -1747,12 +720,12 @@
vpaddd %zmm11,%zmm12,%zmm12
vpxord %zmm13,%zmm0,%zmm0
leaq 16(%rcx),%rax
-.Lvaesenc_loop_tail_full_vec__func3:
+.Lvaesenc_loop_tail_full_vec__func1:
vbroadcasti32x4 (%rax),%zmm9
vaesenc %zmm9,%zmm0,%zmm0
addq $16,%rax
cmpq %rax,%r11
- jne .Lvaesenc_loop_tail_full_vec__func3
+ jne .Lvaesenc_loop_tail_full_vec__func1
vaesenclast %zmm14,%zmm0,%zmm0
@@ -1779,12 +752,12 @@
addq $64,%rsi
subq $64,%rdx
cmpq $64,%rdx
- jae .Lcrypt_loop_1x__func3
+ jae .Lcrypt_loop_1x__func1
testq %rdx,%rdx
- jz .Lreduce__func3
+ jz .Lreduce__func1
-.Lpartial_vec__func3:
+.Lpartial_vec__func1:
@@ -1803,12 +776,12 @@
vpshufb %zmm8,%zmm12,%zmm0
vpxord %zmm13,%zmm0,%zmm0
leaq 16(%rcx),%rax
-.Lvaesenc_loop_tail_partialvec__func3:
+.Lvaesenc_loop_tail_partialvec__func1:
vbroadcasti32x4 (%rax),%zmm9
vaesenc %zmm9,%zmm0,%zmm0
addq $16,%rax
cmpq %rax,%r11
- jne .Lvaesenc_loop_tail_partialvec__func3
+ jne .Lvaesenc_loop_tail_partialvec__func1
vaesenclast %zmm14,%zmm0,%zmm0
@@ -1841,7 +814,7 @@
vpxord %zmm3,%zmm6,%zmm6
-.Lreduce__func3:
+.Lreduce__func1:
vpclmulqdq $0x01,%zmm4,%zmm31,%zmm0
vpshufd $0x4e,%zmm4,%zmm4
@@ -1857,7 +830,7 @@
vpternlogd $0x96,%xmm1,%xmm2,%xmm10
-.Ldone__func3:
+.Ldone__func1:
vpshufb %xmm8,%xmm10,%xmm10
vmovdqu %xmm10,(%r12)
@@ -1915,7 +888,7 @@
cmpq $256-1,%rdx
- jbe .Lcrypt_loop_4x_done__func4
+ jbe .Lcrypt_loop_4x_done__func2
vmovdqu8 256-256(%r9),%zmm27
@@ -1931,7 +904,7 @@
vbroadcasti32x4 -48(%r11),%zmm21
vbroadcasti32x4 -32(%r11),%zmm22
vbroadcasti32x4 -16(%r11),%zmm23
-.Lcrypt_loop_4x__func4:
+.Lcrypt_loop_4x__func2:
vmovdqu8 0(%rdi),%zmm4
vmovdqu8 64(%rdi),%zmm5
vmovdqu8 128(%rdi),%zmm6
@@ -1955,8 +928,8 @@
vpxord %zmm13,%zmm3,%zmm3
cmpl $24,%r10d
- jl .Laes128__func4
- je .Laes192__func4
+ jl .Laes128__func2
+ je .Laes192__func2
vbroadcasti32x4 -208(%r11),%zmm9
vaesenc %zmm9,%zmm0,%zmm0
@@ -1970,7 +943,7 @@
vaesenc %zmm9,%zmm2,%zmm2
vaesenc %zmm9,%zmm3,%zmm3
-.Laes192__func4:
+.Laes192__func2:
vbroadcasti32x4 -176(%r11),%zmm9
vaesenc %zmm9,%zmm0,%zmm0
vaesenc %zmm9,%zmm1,%zmm1
@@ -1983,7 +956,7 @@
vaesenc %zmm9,%zmm2,%zmm2
vaesenc %zmm9,%zmm3,%zmm3
-.Laes128__func4:
+.Laes128__func2:
vpshufb %zmm8,%zmm4,%zmm4
vpxord %zmm10,%zmm4,%zmm4
vpshufb %zmm8,%zmm5,%zmm5
@@ -2104,11 +1077,11 @@
subq $-256,%rsi
addq $-256,%rdx
cmpq $256-1,%rdx
- ja .Lcrypt_loop_4x__func4
-.Lcrypt_loop_4x_done__func4:
+ ja .Lcrypt_loop_4x__func2
+.Lcrypt_loop_4x_done__func2:
testq %rdx,%rdx
- jz .Ldone__func4
+ jz .Ldone__func2
@@ -2138,9 +1111,9 @@
vpxor %xmm6,%xmm6,%xmm6
cmpq $64,%rdx
- jb .Lpartial_vec__func4
+ jb .Lpartial_vec__func2
-.Lcrypt_loop_1x__func4:
+.Lcrypt_loop_1x__func2:
@@ -2148,12 +1121,12 @@
vpaddd %zmm11,%zmm12,%zmm12
vpxord %zmm13,%zmm0,%zmm0
leaq 16(%rcx),%rax
-.Lvaesenc_loop_tail_full_vec__func4:
+.Lvaesenc_loop_tail_full_vec__func2:
vbroadcasti32x4 (%rax),%zmm9
vaesenc %zmm9,%zmm0,%zmm0
addq $16,%rax
cmpq %rax,%r11
- jne .Lvaesenc_loop_tail_full_vec__func4
+ jne .Lvaesenc_loop_tail_full_vec__func2
vaesenclast %zmm14,%zmm0,%zmm0
@@ -2180,12 +1153,12 @@
addq $64,%rsi
subq $64,%rdx
cmpq $64,%rdx
- jae .Lcrypt_loop_1x__func4
+ jae .Lcrypt_loop_1x__func2
testq %rdx,%rdx
- jz .Lreduce__func4
+ jz .Lreduce__func2
-.Lpartial_vec__func4:
+.Lpartial_vec__func2:
@@ -2204,12 +1177,12 @@
vpshufb %zmm8,%zmm12,%zmm0
vpxord %zmm13,%zmm0,%zmm0
leaq 16(%rcx),%rax
-.Lvaesenc_loop_tail_partialvec__func4:
+.Lvaesenc_loop_tail_partialvec__func2:
vbroadcasti32x4 (%rax),%zmm9
vaesenc %zmm9,%zmm0,%zmm0
addq $16,%rax
cmpq %rax,%r11
- jne .Lvaesenc_loop_tail_partialvec__func4
+ jne .Lvaesenc_loop_tail_partialvec__func2
vaesenclast %zmm14,%zmm0,%zmm0
@@ -2242,7 +1215,7 @@
vpxord %zmm3,%zmm6,%zmm6
-.Lreduce__func4:
+.Lreduce__func2:
vpclmulqdq $0x01,%zmm4,%zmm31,%zmm0
vpshufd $0x4e,%zmm4,%zmm4
@@ -2258,7 +1231,7 @@
vpternlogd $0x96,%xmm1,%xmm2,%xmm10
-.Ldone__func4:
+.Ldone__func2:
vpshufb %xmm8,%xmm10,%xmm10
vmovdqu %xmm10,(%r12)
diff --git a/gen/bcm/aes-gcm-avx10-x86_64-win.asm b/gen/bcm/aes-gcm-avx10-x86_64-win.asm
index 258f923..fb9f896 100644
--- a/gen/bcm/aes-gcm-avx10-x86_64-win.asm
+++ b/gen/bcm/aes-gcm-avx10-x86_64-win.asm
@@ -88,15 +88,15 @@
$L$SEH_end_gcm_gmult_vpclmulqdq_avx10_5:
-global gcm_init_vpclmulqdq_avx10
+global gcm_init_vpclmulqdq_avx10_512
ALIGN 32
-gcm_init_vpclmulqdq_avx10:
+gcm_init_vpclmulqdq_avx10_512:
_CET_ENDBR
- lea r8,[((256-32))+rcx]
+ lea r8,[((256-64))+rcx]
@@ -124,7 +124,7 @@
vpternlogd xmm3,xmm0,XMMWORD[$L$gfpoly_and_internal_carrybit],0x78
- vbroadcasti32x4 ymm5,YMMWORD[$L$gfpoly]
+ vbroadcasti32x4 zmm5,ZMMWORD[$L$gfpoly]
@@ -149,16 +149,6 @@
vinserti128 ymm3,ymm4,xmm3,1
vinserti128 ymm4,ymm4,xmm4,1
-
- vmovdqu8 YMMWORD[r8],ymm3
-
-
-
-
-
- mov eax,7
-$L$precompute_next__func1:
- sub r8,32
vpclmulqdq ymm0,ymm3,ymm4,0x00
vpclmulqdq ymm1,ymm3,ymm4,0x01
vpclmulqdq ymm2,ymm3,ymm4,0x10
@@ -166,12 +156,36 @@
vpclmulqdq ymm2,ymm5,ymm0,0x01
vpshufd ymm0,ymm0,0x4e
vpternlogd ymm1,ymm0,ymm2,0x96
- vpclmulqdq ymm3,ymm3,ymm4,0x11
+ vpclmulqdq ymm4,ymm3,ymm4,0x11
vpclmulqdq ymm0,ymm5,ymm1,0x01
vpshufd ymm1,ymm1,0x4e
- vpternlogd ymm3,ymm1,ymm0,0x96
+ vpternlogd ymm4,ymm1,ymm0,0x96
- vmovdqu8 YMMWORD[r8],ymm3
+ vinserti64x4 zmm3,zmm4,ymm3,1
+ vshufi64x2 zmm4,zmm4,zmm4,0
+
+ vmovdqu8 ZMMWORD[r8],zmm3
+
+
+
+
+
+ mov eax,3
+$L$precompute_next__func1:
+ sub r8,64
+ vpclmulqdq zmm0,zmm3,zmm4,0x00
+ vpclmulqdq zmm1,zmm3,zmm4,0x01
+ vpclmulqdq zmm2,zmm3,zmm4,0x10
+ vpxord zmm1,zmm1,zmm2
+ vpclmulqdq zmm2,zmm5,zmm0,0x01
+ vpshufd zmm0,zmm0,0x4e
+ vpternlogd zmm1,zmm0,zmm2,0x96
+ vpclmulqdq zmm3,zmm3,zmm4,0x11
+ vpclmulqdq zmm0,zmm5,zmm1,0x01
+ vpshufd zmm1,zmm1,0x4e
+ vpternlogd zmm3,zmm1,zmm0,0x96
+
+ vmovdqu8 ZMMWORD[r8],zmm3
dec eax
jnz NEAR $L$precompute_next__func1
@@ -180,1150 +194,6 @@
-global gcm_ghash_vpclmulqdq_avx10_256
-
-ALIGN 32
-gcm_ghash_vpclmulqdq_avx10_256:
-
-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1:
-_CET_ENDBR
- sub rsp,136
-$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_2:
- movdqa XMMWORD[rsp],xmm6
-$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_3:
- movdqa XMMWORD[16+rsp],xmm7
-$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_4:
- movdqa XMMWORD[32+rsp],xmm8
-$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_5:
- movdqa XMMWORD[48+rsp],xmm9
-$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_6:
- movdqa XMMWORD[64+rsp],xmm10
-$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_7:
- movdqa XMMWORD[80+rsp],xmm11
-$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_8:
- movdqa XMMWORD[96+rsp],xmm12
-$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_9:
- movdqa XMMWORD[112+rsp],xmm13
-$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_10:
-
-$L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx10_256_11:
-
-
-
-
- vmovdqu xmm4,XMMWORD[$L$bswap_mask]
- vmovdqu xmm10,XMMWORD[$L$gfpoly]
-
-
- vmovdqu xmm5,XMMWORD[rcx]
- vpshufb xmm5,xmm5,xmm4
-
-
- cmp r9,32
- jb NEAR $L$aad_blockbyblock__func1
-
-
-
- vshufi64x2 ymm4,ymm4,ymm4,0
- vshufi64x2 ymm10,ymm10,ymm10,0
-
-
- vmovdqu8 ymm9,YMMWORD[((256-32))+rdx]
-
- cmp r9,4*32-1
- jbe NEAR $L$aad_loop_1x__func1
-
-
- vmovdqu8 ymm6,YMMWORD[((256-128))+rdx]
- vmovdqu8 ymm7,YMMWORD[((256-96))+rdx]
- vmovdqu8 ymm8,YMMWORD[((256-64))+rdx]
-
-
-$L$aad_loop_4x__func1:
- vmovdqu8 ymm0,YMMWORD[r8]
- vmovdqu8 ymm1,YMMWORD[32+r8]
- vmovdqu8 ymm2,YMMWORD[64+r8]
- vmovdqu8 ymm3,YMMWORD[96+r8]
- vpshufb ymm0,ymm0,ymm4
- vpxord ymm0,ymm0,ymm5
- vpshufb ymm1,ymm1,ymm4
- vpshufb ymm2,ymm2,ymm4
- vpshufb ymm3,ymm3,ymm4
- vpclmulqdq ymm5,ymm0,ymm6,0x00
- vpclmulqdq ymm11,ymm1,ymm7,0x00
- vpclmulqdq ymm12,ymm2,ymm8,0x00
- vpxord ymm5,ymm5,ymm11
- vpclmulqdq ymm13,ymm3,ymm9,0x00
- vpternlogd ymm5,ymm12,ymm13,0x96
- vpclmulqdq ymm11,ymm0,ymm6,0x01
- vpclmulqdq ymm12,ymm1,ymm7,0x01
- vpclmulqdq ymm13,ymm2,ymm8,0x01
- vpternlogd ymm11,ymm12,ymm13,0x96
- vpclmulqdq ymm12,ymm3,ymm9,0x01
- vpclmulqdq ymm13,ymm0,ymm6,0x10
- vpternlogd ymm11,ymm12,ymm13,0x96
- vpclmulqdq ymm12,ymm1,ymm7,0x10
- vpclmulqdq ymm13,ymm2,ymm8,0x10
- vpternlogd ymm11,ymm12,ymm13,0x96
- vpclmulqdq ymm13,ymm10,ymm5,0x01
- vpclmulqdq ymm12,ymm3,ymm9,0x10
- vpxord ymm11,ymm11,ymm12
- vpshufd ymm5,ymm5,0x4e
- vpclmulqdq ymm0,ymm0,ymm6,0x11
- vpclmulqdq ymm1,ymm1,ymm7,0x11
- vpclmulqdq ymm2,ymm2,ymm8,0x11
- vpternlogd ymm11,ymm5,ymm13,0x96
- vpclmulqdq ymm3,ymm3,ymm9,0x11
- vpternlogd ymm0,ymm1,ymm2,0x96
- vpclmulqdq ymm12,ymm10,ymm11,0x01
- vpxord ymm5,ymm0,ymm3
- vpshufd ymm11,ymm11,0x4e
- vpternlogd ymm5,ymm11,ymm12,0x96
- vextracti32x4 xmm0,ymm5,1
- vpxord xmm5,xmm5,xmm0
-
- sub r8,-4*32
- add r9,-4*32
- cmp r9,4*32-1
- ja NEAR $L$aad_loop_4x__func1
-
-
- cmp r9,32
- jb NEAR $L$aad_large_done__func1
-$L$aad_loop_1x__func1:
- vmovdqu8 ymm0,YMMWORD[r8]
- vpshufb ymm0,ymm0,ymm4
- vpxord ymm5,ymm5,ymm0
- vpclmulqdq ymm0,ymm5,ymm9,0x00
- vpclmulqdq ymm1,ymm5,ymm9,0x01
- vpclmulqdq ymm2,ymm5,ymm9,0x10
- vpxord ymm1,ymm1,ymm2
- vpclmulqdq ymm2,ymm10,ymm0,0x01
- vpshufd ymm0,ymm0,0x4e
- vpternlogd ymm1,ymm0,ymm2,0x96
- vpclmulqdq ymm5,ymm5,ymm9,0x11
- vpclmulqdq ymm0,ymm10,ymm1,0x01
- vpshufd ymm1,ymm1,0x4e
- vpternlogd ymm5,ymm1,ymm0,0x96
-
- vextracti32x4 xmm0,ymm5,1
- vpxord xmm5,xmm5,xmm0
-
- add r8,32
- sub r9,32
- cmp r9,32
- jae NEAR $L$aad_loop_1x__func1
-
-$L$aad_large_done__func1:
-
-
- vzeroupper
-
-
-$L$aad_blockbyblock__func1:
- test r9,r9
- jz NEAR $L$aad_done__func1
- vmovdqu xmm9,XMMWORD[((256-16))+rdx]
-$L$aad_loop_blockbyblock__func1:
- vmovdqu xmm0,XMMWORD[r8]
- vpshufb xmm0,xmm0,xmm4
- vpxor xmm5,xmm5,xmm0
- vpclmulqdq xmm0,xmm5,xmm9,0x00
- vpclmulqdq xmm1,xmm5,xmm9,0x01
- vpclmulqdq xmm2,xmm5,xmm9,0x10
- vpxord xmm1,xmm1,xmm2
- vpclmulqdq xmm2,xmm10,xmm0,0x01
- vpshufd xmm0,xmm0,0x4e
- vpternlogd xmm1,xmm0,xmm2,0x96
- vpclmulqdq xmm5,xmm5,xmm9,0x11
- vpclmulqdq xmm0,xmm10,xmm1,0x01
- vpshufd xmm1,xmm1,0x4e
- vpternlogd xmm5,xmm1,xmm0,0x96
-
- add r8,16
- sub r9,16
- jnz NEAR $L$aad_loop_blockbyblock__func1
-
-$L$aad_done__func1:
-
- vpshufb xmm5,xmm5,xmm4
- vmovdqu XMMWORD[rcx],xmm5
- movdqa xmm6,XMMWORD[rsp]
- movdqa xmm7,XMMWORD[16+rsp]
- movdqa xmm8,XMMWORD[32+rsp]
- movdqa xmm9,XMMWORD[48+rsp]
- movdqa xmm10,XMMWORD[64+rsp]
- movdqa xmm11,XMMWORD[80+rsp]
- movdqa xmm12,XMMWORD[96+rsp]
- movdqa xmm13,XMMWORD[112+rsp]
- add rsp,136
- ret
-$L$SEH_end_gcm_ghash_vpclmulqdq_avx10_256_12:
-
-
-global aes_gcm_enc_update_vaes_avx10_256
-
-ALIGN 32
-aes_gcm_enc_update_vaes_avx10_256:
-
-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1:
-_CET_ENDBR
- push rsi
-$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_2:
- push rdi
-$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_3:
- push r12
-$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_4:
-
- mov rsi,QWORD[64+rsp]
- mov rdi,QWORD[72+rsp]
- mov r12,QWORD[80+rsp]
- sub rsp,160
-$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_5:
- movdqa XMMWORD[rsp],xmm6
-$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_6:
- movdqa XMMWORD[16+rsp],xmm7
-$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_7:
- movdqa XMMWORD[32+rsp],xmm8
-$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_8:
- movdqa XMMWORD[48+rsp],xmm9
-$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_9:
- movdqa XMMWORD[64+rsp],xmm10
-$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_10:
- movdqa XMMWORD[80+rsp],xmm11
-$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_11:
- movdqa XMMWORD[96+rsp],xmm12
-$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_12:
- movdqa XMMWORD[112+rsp],xmm13
-$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_13:
- movdqa XMMWORD[128+rsp],xmm14
-$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_14:
- movdqa XMMWORD[144+rsp],xmm15
-$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_15:
-
-$L$SEH_endprologue_aes_gcm_enc_update_vaes_avx10_256_16:
-%ifdef BORINGSSL_DISPATCH_TEST
-EXTERN BORINGSSL_function_hit
- mov BYTE[((BORINGSSL_function_hit+6))],1
-%endif
-
- vbroadcasti32x4 ymm8,YMMWORD[$L$bswap_mask]
- vbroadcasti32x4 ymm31,YMMWORD[$L$gfpoly]
-
-
-
- vmovdqu xmm10,XMMWORD[r12]
- vpshufb xmm10,xmm10,xmm8
- vbroadcasti32x4 ymm12,YMMWORD[rsi]
- vpshufb ymm12,ymm12,ymm8
-
-
-
- mov r10d,DWORD[240+r9]
- lea r10d,[((-20))+r10*4]
-
-
-
-
- lea r11,[96+r10*4+r9]
- vbroadcasti32x4 ymm13,YMMWORD[r9]
- vbroadcasti32x4 ymm14,YMMWORD[r11]
-
-
- vpaddd ymm12,ymm12,YMMWORD[$L$ctr_pattern]
-
-
- vbroadcasti32x4 ymm11,YMMWORD[$L$inc_2blocks]
-
-
-
- cmp r8,4*32-1
- jbe NEAR $L$crypt_loop_4x_done__func1
-
-
- vmovdqu8 ymm27,YMMWORD[((256-128))+rdi]
- vmovdqu8 ymm28,YMMWORD[((256-96))+rdi]
- vmovdqu8 ymm29,YMMWORD[((256-64))+rdi]
- vmovdqu8 ymm30,YMMWORD[((256-32))+rdi]
-
-
-
-
- vpshufb ymm0,ymm12,ymm8
- vpaddd ymm12,ymm12,ymm11
- vpshufb ymm1,ymm12,ymm8
- vpaddd ymm12,ymm12,ymm11
- vpshufb ymm2,ymm12,ymm8
- vpaddd ymm12,ymm12,ymm11
- vpshufb ymm3,ymm12,ymm8
- vpaddd ymm12,ymm12,ymm11
-
-
- vpxord ymm0,ymm0,ymm13
- vpxord ymm1,ymm1,ymm13
- vpxord ymm2,ymm2,ymm13
- vpxord ymm3,ymm3,ymm13
-
- lea rax,[16+r9]
-$L$vaesenc_loop_first_4_vecs__func1:
- vbroadcasti32x4 ymm9,YMMWORD[rax]
- vaesenc ymm0,ymm0,ymm9
- vaesenc ymm1,ymm1,ymm9
- vaesenc ymm2,ymm2,ymm9
- vaesenc ymm3,ymm3,ymm9
-
- add rax,16
- cmp r11,rax
- jne NEAR $L$vaesenc_loop_first_4_vecs__func1
-
-
-
- vpxord ymm4,ymm14,YMMWORD[rcx]
- vpxord ymm5,ymm14,YMMWORD[32+rcx]
- vpxord ymm6,ymm14,YMMWORD[64+rcx]
- vpxord ymm7,ymm14,YMMWORD[96+rcx]
-
-
-
- vaesenclast ymm4,ymm0,ymm4
- vaesenclast ymm5,ymm1,ymm5
- vaesenclast ymm6,ymm2,ymm6
- vaesenclast ymm7,ymm3,ymm7
-
-
- vmovdqu8 YMMWORD[rdx],ymm4
- vmovdqu8 YMMWORD[32+rdx],ymm5
- vmovdqu8 YMMWORD[64+rdx],ymm6
- vmovdqu8 YMMWORD[96+rdx],ymm7
-
- sub rcx,-4*32
- sub rdx,-4*32
- add r8,-4*32
- cmp r8,4*32-1
- jbe NEAR $L$ghash_last_ciphertext_4x__func1
- vbroadcasti32x4 ymm15,YMMWORD[((-144))+r11]
- vbroadcasti32x4 ymm16,YMMWORD[((-128))+r11]
- vbroadcasti32x4 ymm17,YMMWORD[((-112))+r11]
- vbroadcasti32x4 ymm18,YMMWORD[((-96))+r11]
- vbroadcasti32x4 ymm19,YMMWORD[((-80))+r11]
- vbroadcasti32x4 ymm20,YMMWORD[((-64))+r11]
- vbroadcasti32x4 ymm21,YMMWORD[((-48))+r11]
- vbroadcasti32x4 ymm22,YMMWORD[((-32))+r11]
- vbroadcasti32x4 ymm23,YMMWORD[((-16))+r11]
-$L$crypt_loop_4x__func1:
-
-
-
- vpshufb ymm0,ymm12,ymm8
- vpaddd ymm12,ymm12,ymm11
- vpshufb ymm1,ymm12,ymm8
- vpaddd ymm12,ymm12,ymm11
- vpshufb ymm2,ymm12,ymm8
- vpaddd ymm12,ymm12,ymm11
- vpshufb ymm3,ymm12,ymm8
- vpaddd ymm12,ymm12,ymm11
-
-
- vpxord ymm0,ymm0,ymm13
- vpxord ymm1,ymm1,ymm13
- vpxord ymm2,ymm2,ymm13
- vpxord ymm3,ymm3,ymm13
-
- cmp r10d,24
- jl NEAR $L$aes128__func1
- je NEAR $L$aes192__func1
-
- vbroadcasti32x4 ymm9,YMMWORD[((-208))+r11]
- vaesenc ymm0,ymm0,ymm9
- vaesenc ymm1,ymm1,ymm9
- vaesenc ymm2,ymm2,ymm9
- vaesenc ymm3,ymm3,ymm9
-
- vbroadcasti32x4 ymm9,YMMWORD[((-192))+r11]
- vaesenc ymm0,ymm0,ymm9
- vaesenc ymm1,ymm1,ymm9
- vaesenc ymm2,ymm2,ymm9
- vaesenc ymm3,ymm3,ymm9
-
-$L$aes192__func1:
- vbroadcasti32x4 ymm9,YMMWORD[((-176))+r11]
- vaesenc ymm0,ymm0,ymm9
- vaesenc ymm1,ymm1,ymm9
- vaesenc ymm2,ymm2,ymm9
- vaesenc ymm3,ymm3,ymm9
-
- vbroadcasti32x4 ymm9,YMMWORD[((-160))+r11]
- vaesenc ymm0,ymm0,ymm9
- vaesenc ymm1,ymm1,ymm9
- vaesenc ymm2,ymm2,ymm9
- vaesenc ymm3,ymm3,ymm9
-
-$L$aes128__func1:
- vpshufb ymm4,ymm4,ymm8
- vpxord ymm4,ymm4,ymm10
- vpshufb ymm5,ymm5,ymm8
- vpshufb ymm6,ymm6,ymm8
-
- vaesenc ymm0,ymm0,ymm15
- vaesenc ymm1,ymm1,ymm15
- vaesenc ymm2,ymm2,ymm15
- vaesenc ymm3,ymm3,ymm15
-
- vpshufb ymm7,ymm7,ymm8
- vpclmulqdq ymm10,ymm4,ymm27,0x00
- vpclmulqdq ymm24,ymm5,ymm28,0x00
- vpclmulqdq ymm25,ymm6,ymm29,0x00
-
- vaesenc ymm0,ymm0,ymm16
- vaesenc ymm1,ymm1,ymm16
- vaesenc ymm2,ymm2,ymm16
- vaesenc ymm3,ymm3,ymm16
-
- vpxord ymm10,ymm10,ymm24
- vpclmulqdq ymm26,ymm7,ymm30,0x00
- vpternlogd ymm10,ymm25,ymm26,0x96
- vpclmulqdq ymm24,ymm4,ymm27,0x01
-
- vaesenc ymm0,ymm0,ymm17
- vaesenc ymm1,ymm1,ymm17
- vaesenc ymm2,ymm2,ymm17
- vaesenc ymm3,ymm3,ymm17
-
- vpclmulqdq ymm25,ymm5,ymm28,0x01
- vpclmulqdq ymm26,ymm6,ymm29,0x01
- vpternlogd ymm24,ymm25,ymm26,0x96
- vpclmulqdq ymm25,ymm7,ymm30,0x01
-
- vaesenc ymm0,ymm0,ymm18
- vaesenc ymm1,ymm1,ymm18
- vaesenc ymm2,ymm2,ymm18
- vaesenc ymm3,ymm3,ymm18
-
- vpclmulqdq ymm26,ymm4,ymm27,0x10
- vpternlogd ymm24,ymm25,ymm26,0x96
- vpclmulqdq ymm25,ymm5,ymm28,0x10
- vpclmulqdq ymm26,ymm6,ymm29,0x10
-
- vaesenc ymm0,ymm0,ymm19
- vaesenc ymm1,ymm1,ymm19
- vaesenc ymm2,ymm2,ymm19
- vaesenc ymm3,ymm3,ymm19
-
- vpternlogd ymm24,ymm25,ymm26,0x96
- vpclmulqdq ymm26,ymm31,ymm10,0x01
- vpclmulqdq ymm25,ymm7,ymm30,0x10
- vpxord ymm24,ymm24,ymm25
-
- vaesenc ymm0,ymm0,ymm20
- vaesenc ymm1,ymm1,ymm20
- vaesenc ymm2,ymm2,ymm20
- vaesenc ymm3,ymm3,ymm20
-
- vpshufd ymm10,ymm10,0x4e
- vpclmulqdq ymm4,ymm4,ymm27,0x11
- vpclmulqdq ymm5,ymm5,ymm28,0x11
- vpclmulqdq ymm6,ymm6,ymm29,0x11
-
- vaesenc ymm0,ymm0,ymm21
- vaesenc ymm1,ymm1,ymm21
- vaesenc ymm2,ymm2,ymm21
- vaesenc ymm3,ymm3,ymm21
-
- vpternlogd ymm24,ymm10,ymm26,0x96
- vpclmulqdq ymm7,ymm7,ymm30,0x11
- vpternlogd ymm4,ymm5,ymm6,0x96
- vpclmulqdq ymm25,ymm31,ymm24,0x01
-
- vaesenc ymm0,ymm0,ymm22
- vaesenc ymm1,ymm1,ymm22
- vaesenc ymm2,ymm2,ymm22
- vaesenc ymm3,ymm3,ymm22
-
- vpxord ymm10,ymm4,ymm7
- vpshufd ymm24,ymm24,0x4e
- vpternlogd ymm10,ymm24,ymm25,0x96
-
- vaesenc ymm0,ymm0,ymm23
- vaesenc ymm1,ymm1,ymm23
- vaesenc ymm2,ymm2,ymm23
- vaesenc ymm3,ymm3,ymm23
-
- vextracti32x4 xmm4,ymm10,1
- vpxord xmm10,xmm10,xmm4
-
-
-
-
- vpxord ymm4,ymm14,YMMWORD[rcx]
- vpxord ymm5,ymm14,YMMWORD[32+rcx]
- vpxord ymm6,ymm14,YMMWORD[64+rcx]
- vpxord ymm7,ymm14,YMMWORD[96+rcx]
-
-
-
- vaesenclast ymm4,ymm0,ymm4
- vaesenclast ymm5,ymm1,ymm5
- vaesenclast ymm6,ymm2,ymm6
- vaesenclast ymm7,ymm3,ymm7
-
-
- vmovdqu8 YMMWORD[rdx],ymm4
- vmovdqu8 YMMWORD[32+rdx],ymm5
- vmovdqu8 YMMWORD[64+rdx],ymm6
- vmovdqu8 YMMWORD[96+rdx],ymm7
-
- sub rcx,-4*32
- sub rdx,-4*32
- add r8,-4*32
- cmp r8,4*32-1
- ja NEAR $L$crypt_loop_4x__func1
-$L$ghash_last_ciphertext_4x__func1:
- vpshufb ymm4,ymm4,ymm8
- vpxord ymm4,ymm4,ymm10
- vpshufb ymm5,ymm5,ymm8
- vpshufb ymm6,ymm6,ymm8
- vpshufb ymm7,ymm7,ymm8
- vpclmulqdq ymm10,ymm4,ymm27,0x00
- vpclmulqdq ymm24,ymm5,ymm28,0x00
- vpclmulqdq ymm25,ymm6,ymm29,0x00
- vpxord ymm10,ymm10,ymm24
- vpclmulqdq ymm26,ymm7,ymm30,0x00
- vpternlogd ymm10,ymm25,ymm26,0x96
- vpclmulqdq ymm24,ymm4,ymm27,0x01
- vpclmulqdq ymm25,ymm5,ymm28,0x01
- vpclmulqdq ymm26,ymm6,ymm29,0x01
- vpternlogd ymm24,ymm25,ymm26,0x96
- vpclmulqdq ymm25,ymm7,ymm30,0x01
- vpclmulqdq ymm26,ymm4,ymm27,0x10
- vpternlogd ymm24,ymm25,ymm26,0x96
- vpclmulqdq ymm25,ymm5,ymm28,0x10
- vpclmulqdq ymm26,ymm6,ymm29,0x10
- vpternlogd ymm24,ymm25,ymm26,0x96
- vpclmulqdq ymm26,ymm31,ymm10,0x01
- vpclmulqdq ymm25,ymm7,ymm30,0x10
- vpxord ymm24,ymm24,ymm25
- vpshufd ymm10,ymm10,0x4e
- vpclmulqdq ymm4,ymm4,ymm27,0x11
- vpclmulqdq ymm5,ymm5,ymm28,0x11
- vpclmulqdq ymm6,ymm6,ymm29,0x11
- vpternlogd ymm24,ymm10,ymm26,0x96
- vpclmulqdq ymm7,ymm7,ymm30,0x11
- vpternlogd ymm4,ymm5,ymm6,0x96
- vpclmulqdq ymm25,ymm31,ymm24,0x01
- vpxord ymm10,ymm4,ymm7
- vpshufd ymm24,ymm24,0x4e
- vpternlogd ymm10,ymm24,ymm25,0x96
- vextracti32x4 xmm4,ymm10,1
- vpxord xmm10,xmm10,xmm4
-
-$L$crypt_loop_4x_done__func1:
-
- test r8,r8
- jz NEAR $L$done__func1
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- mov rax,r8
- neg rax
- and rax,-16
- lea rsi,[256+rax*1+rdi]
- vpxor xmm4,xmm4,xmm4
- vpxor xmm5,xmm5,xmm5
- vpxor xmm6,xmm6,xmm6
-
- cmp r8,32
- jb NEAR $L$partial_vec__func1
-
-$L$crypt_loop_1x__func1:
-
-
-
- vpshufb ymm0,ymm12,ymm8
- vpaddd ymm12,ymm12,ymm11
- vpxord ymm0,ymm0,ymm13
- lea rax,[16+r9]
-$L$vaesenc_loop_tail_full_vec__func1:
- vbroadcasti32x4 ymm9,YMMWORD[rax]
- vaesenc ymm0,ymm0,ymm9
- add rax,16
- cmp r11,rax
- jne NEAR $L$vaesenc_loop_tail_full_vec__func1
- vaesenclast ymm0,ymm0,ymm14
-
-
- vmovdqu8 ymm1,YMMWORD[rcx]
- vpxord ymm0,ymm0,ymm1
- vmovdqu8 YMMWORD[rdx],ymm0
-
-
- vmovdqu8 ymm30,YMMWORD[rsi]
- vpshufb ymm0,ymm0,ymm8
- vpxord ymm0,ymm0,ymm10
- vpclmulqdq ymm7,ymm0,ymm30,0x00
- vpclmulqdq ymm1,ymm0,ymm30,0x01
- vpclmulqdq ymm2,ymm0,ymm30,0x10
- vpclmulqdq ymm3,ymm0,ymm30,0x11
- vpxord ymm4,ymm4,ymm7
- vpternlogd ymm5,ymm1,ymm2,0x96
- vpxord ymm6,ymm6,ymm3
-
- vpxor xmm10,xmm10,xmm10
-
- add rsi,32
- add rcx,32
- add rdx,32
- sub r8,32
- cmp r8,32
- jae NEAR $L$crypt_loop_1x__func1
-
- test r8,r8
- jz NEAR $L$reduce__func1
-
-$L$partial_vec__func1:
-
-
-
-
- mov rax,-1
- bzhi rax,rax,r8
- kmovd k1,eax
- add r8,15
- and r8,-16
- mov rax,-1
- bzhi rax,rax,r8
- kmovd k2,eax
-
-
-
- vpshufb ymm0,ymm12,ymm8
- vpxord ymm0,ymm0,ymm13
- lea rax,[16+r9]
-$L$vaesenc_loop_tail_partialvec__func1:
- vbroadcasti32x4 ymm9,YMMWORD[rax]
- vaesenc ymm0,ymm0,ymm9
- add rax,16
- cmp r11,rax
- jne NEAR $L$vaesenc_loop_tail_partialvec__func1
- vaesenclast ymm0,ymm0,ymm14
-
-
- vmovdqu8 ymm1{k1}{z},[rcx]
- vpxord ymm0,ymm0,ymm1
- vmovdqu8 YMMWORD[rdx]{k1},ymm0
-
-
-
-
-
-
-
-
-
-
-
-
-
- vmovdqu8 ymm30{k2}{z},[rsi]
- vmovdqu8 ymm1{k1}{z},ymm0
- vpshufb ymm0,ymm1,ymm8
- vpxord ymm0,ymm0,ymm10
- vpclmulqdq ymm7,ymm0,ymm30,0x00
- vpclmulqdq ymm1,ymm0,ymm30,0x01
- vpclmulqdq ymm2,ymm0,ymm30,0x10
- vpclmulqdq ymm3,ymm0,ymm30,0x11
- vpxord ymm4,ymm4,ymm7
- vpternlogd ymm5,ymm1,ymm2,0x96
- vpxord ymm6,ymm6,ymm3
-
-
-$L$reduce__func1:
-
- vpclmulqdq ymm0,ymm31,ymm4,0x01
- vpshufd ymm4,ymm4,0x4e
- vpternlogd ymm5,ymm4,ymm0,0x96
- vpclmulqdq ymm0,ymm31,ymm5,0x01
- vpshufd ymm5,ymm5,0x4e
- vpternlogd ymm6,ymm5,ymm0,0x96
-
- vextracti32x4 xmm0,ymm6,1
- vpxord xmm10,xmm6,xmm0
-
-
-$L$done__func1:
-
- vpshufb xmm10,xmm10,xmm8
- vmovdqu XMMWORD[r12],xmm10
-
- vzeroupper
- movdqa xmm6,XMMWORD[rsp]
- movdqa xmm7,XMMWORD[16+rsp]
- movdqa xmm8,XMMWORD[32+rsp]
- movdqa xmm9,XMMWORD[48+rsp]
- movdqa xmm10,XMMWORD[64+rsp]
- movdqa xmm11,XMMWORD[80+rsp]
- movdqa xmm12,XMMWORD[96+rsp]
- movdqa xmm13,XMMWORD[112+rsp]
- movdqa xmm14,XMMWORD[128+rsp]
- movdqa xmm15,XMMWORD[144+rsp]
- add rsp,160
- pop r12
- pop rdi
- pop rsi
- ret
-$L$SEH_end_aes_gcm_enc_update_vaes_avx10_256_17:
-
-
-global aes_gcm_dec_update_vaes_avx10_256
-
-ALIGN 32
-aes_gcm_dec_update_vaes_avx10_256:
-
-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1:
-_CET_ENDBR
- push rsi
-$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_2:
- push rdi
-$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_3:
- push r12
-$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_4:
-
- mov rsi,QWORD[64+rsp]
- mov rdi,QWORD[72+rsp]
- mov r12,QWORD[80+rsp]
- sub rsp,160
-$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_5:
- movdqa XMMWORD[rsp],xmm6
-$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_6:
- movdqa XMMWORD[16+rsp],xmm7
-$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_7:
- movdqa XMMWORD[32+rsp],xmm8
-$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_8:
- movdqa XMMWORD[48+rsp],xmm9
-$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_9:
- movdqa XMMWORD[64+rsp],xmm10
-$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_10:
- movdqa XMMWORD[80+rsp],xmm11
-$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_11:
- movdqa XMMWORD[96+rsp],xmm12
-$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_12:
- movdqa XMMWORD[112+rsp],xmm13
-$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_13:
- movdqa XMMWORD[128+rsp],xmm14
-$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_14:
- movdqa XMMWORD[144+rsp],xmm15
-$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_15:
-
-$L$SEH_endprologue_aes_gcm_dec_update_vaes_avx10_256_16:
-
- vbroadcasti32x4 ymm8,YMMWORD[$L$bswap_mask]
- vbroadcasti32x4 ymm31,YMMWORD[$L$gfpoly]
-
-
-
- vmovdqu xmm10,XMMWORD[r12]
- vpshufb xmm10,xmm10,xmm8
- vbroadcasti32x4 ymm12,YMMWORD[rsi]
- vpshufb ymm12,ymm12,ymm8
-
-
-
- mov r10d,DWORD[240+r9]
- lea r10d,[((-20))+r10*4]
-
-
-
-
- lea r11,[96+r10*4+r9]
- vbroadcasti32x4 ymm13,YMMWORD[r9]
- vbroadcasti32x4 ymm14,YMMWORD[r11]
-
-
- vpaddd ymm12,ymm12,YMMWORD[$L$ctr_pattern]
-
-
- vbroadcasti32x4 ymm11,YMMWORD[$L$inc_2blocks]
-
-
-
- cmp r8,4*32-1
- jbe NEAR $L$crypt_loop_4x_done__func2
-
-
- vmovdqu8 ymm27,YMMWORD[((256-128))+rdi]
- vmovdqu8 ymm28,YMMWORD[((256-96))+rdi]
- vmovdqu8 ymm29,YMMWORD[((256-64))+rdi]
- vmovdqu8 ymm30,YMMWORD[((256-32))+rdi]
- vbroadcasti32x4 ymm15,YMMWORD[((-144))+r11]
- vbroadcasti32x4 ymm16,YMMWORD[((-128))+r11]
- vbroadcasti32x4 ymm17,YMMWORD[((-112))+r11]
- vbroadcasti32x4 ymm18,YMMWORD[((-96))+r11]
- vbroadcasti32x4 ymm19,YMMWORD[((-80))+r11]
- vbroadcasti32x4 ymm20,YMMWORD[((-64))+r11]
- vbroadcasti32x4 ymm21,YMMWORD[((-48))+r11]
- vbroadcasti32x4 ymm22,YMMWORD[((-32))+r11]
- vbroadcasti32x4 ymm23,YMMWORD[((-16))+r11]
-$L$crypt_loop_4x__func2:
- vmovdqu8 ymm4,YMMWORD[rcx]
- vmovdqu8 ymm5,YMMWORD[32+rcx]
- vmovdqu8 ymm6,YMMWORD[64+rcx]
- vmovdqu8 ymm7,YMMWORD[96+rcx]
-
-
-
- vpshufb ymm0,ymm12,ymm8
- vpaddd ymm12,ymm12,ymm11
- vpshufb ymm1,ymm12,ymm8
- vpaddd ymm12,ymm12,ymm11
- vpshufb ymm2,ymm12,ymm8
- vpaddd ymm12,ymm12,ymm11
- vpshufb ymm3,ymm12,ymm8
- vpaddd ymm12,ymm12,ymm11
-
-
- vpxord ymm0,ymm0,ymm13
- vpxord ymm1,ymm1,ymm13
- vpxord ymm2,ymm2,ymm13
- vpxord ymm3,ymm3,ymm13
-
- cmp r10d,24
- jl NEAR $L$aes128__func2
- je NEAR $L$aes192__func2
-
- vbroadcasti32x4 ymm9,YMMWORD[((-208))+r11]
- vaesenc ymm0,ymm0,ymm9
- vaesenc ymm1,ymm1,ymm9
- vaesenc ymm2,ymm2,ymm9
- vaesenc ymm3,ymm3,ymm9
-
- vbroadcasti32x4 ymm9,YMMWORD[((-192))+r11]
- vaesenc ymm0,ymm0,ymm9
- vaesenc ymm1,ymm1,ymm9
- vaesenc ymm2,ymm2,ymm9
- vaesenc ymm3,ymm3,ymm9
-
-$L$aes192__func2:
- vbroadcasti32x4 ymm9,YMMWORD[((-176))+r11]
- vaesenc ymm0,ymm0,ymm9
- vaesenc ymm1,ymm1,ymm9
- vaesenc ymm2,ymm2,ymm9
- vaesenc ymm3,ymm3,ymm9
-
- vbroadcasti32x4 ymm9,YMMWORD[((-160))+r11]
- vaesenc ymm0,ymm0,ymm9
- vaesenc ymm1,ymm1,ymm9
- vaesenc ymm2,ymm2,ymm9
- vaesenc ymm3,ymm3,ymm9
-
-$L$aes128__func2:
- vpshufb ymm4,ymm4,ymm8
- vpxord ymm4,ymm4,ymm10
- vpshufb ymm5,ymm5,ymm8
- vpshufb ymm6,ymm6,ymm8
-
- vaesenc ymm0,ymm0,ymm15
- vaesenc ymm1,ymm1,ymm15
- vaesenc ymm2,ymm2,ymm15
- vaesenc ymm3,ymm3,ymm15
-
- vpshufb ymm7,ymm7,ymm8
- vpclmulqdq ymm10,ymm4,ymm27,0x00
- vpclmulqdq ymm24,ymm5,ymm28,0x00
- vpclmulqdq ymm25,ymm6,ymm29,0x00
-
- vaesenc ymm0,ymm0,ymm16
- vaesenc ymm1,ymm1,ymm16
- vaesenc ymm2,ymm2,ymm16
- vaesenc ymm3,ymm3,ymm16
-
- vpxord ymm10,ymm10,ymm24
- vpclmulqdq ymm26,ymm7,ymm30,0x00
- vpternlogd ymm10,ymm25,ymm26,0x96
- vpclmulqdq ymm24,ymm4,ymm27,0x01
-
- vaesenc ymm0,ymm0,ymm17
- vaesenc ymm1,ymm1,ymm17
- vaesenc ymm2,ymm2,ymm17
- vaesenc ymm3,ymm3,ymm17
-
- vpclmulqdq ymm25,ymm5,ymm28,0x01
- vpclmulqdq ymm26,ymm6,ymm29,0x01
- vpternlogd ymm24,ymm25,ymm26,0x96
- vpclmulqdq ymm25,ymm7,ymm30,0x01
-
- vaesenc ymm0,ymm0,ymm18
- vaesenc ymm1,ymm1,ymm18
- vaesenc ymm2,ymm2,ymm18
- vaesenc ymm3,ymm3,ymm18
-
- vpclmulqdq ymm26,ymm4,ymm27,0x10
- vpternlogd ymm24,ymm25,ymm26,0x96
- vpclmulqdq ymm25,ymm5,ymm28,0x10
- vpclmulqdq ymm26,ymm6,ymm29,0x10
-
- vaesenc ymm0,ymm0,ymm19
- vaesenc ymm1,ymm1,ymm19
- vaesenc ymm2,ymm2,ymm19
- vaesenc ymm3,ymm3,ymm19
-
- vpternlogd ymm24,ymm25,ymm26,0x96
- vpclmulqdq ymm26,ymm31,ymm10,0x01
- vpclmulqdq ymm25,ymm7,ymm30,0x10
- vpxord ymm24,ymm24,ymm25
-
- vaesenc ymm0,ymm0,ymm20
- vaesenc ymm1,ymm1,ymm20
- vaesenc ymm2,ymm2,ymm20
- vaesenc ymm3,ymm3,ymm20
-
- vpshufd ymm10,ymm10,0x4e
- vpclmulqdq ymm4,ymm4,ymm27,0x11
- vpclmulqdq ymm5,ymm5,ymm28,0x11
- vpclmulqdq ymm6,ymm6,ymm29,0x11
-
- vaesenc ymm0,ymm0,ymm21
- vaesenc ymm1,ymm1,ymm21
- vaesenc ymm2,ymm2,ymm21
- vaesenc ymm3,ymm3,ymm21
-
- vpternlogd ymm24,ymm10,ymm26,0x96
- vpclmulqdq ymm7,ymm7,ymm30,0x11
- vpternlogd ymm4,ymm5,ymm6,0x96
- vpclmulqdq ymm25,ymm31,ymm24,0x01
-
- vaesenc ymm0,ymm0,ymm22
- vaesenc ymm1,ymm1,ymm22
- vaesenc ymm2,ymm2,ymm22
- vaesenc ymm3,ymm3,ymm22
-
- vpxord ymm10,ymm4,ymm7
- vpshufd ymm24,ymm24,0x4e
- vpternlogd ymm10,ymm24,ymm25,0x96
-
- vaesenc ymm0,ymm0,ymm23
- vaesenc ymm1,ymm1,ymm23
- vaesenc ymm2,ymm2,ymm23
- vaesenc ymm3,ymm3,ymm23
-
- vextracti32x4 xmm4,ymm10,1
- vpxord xmm10,xmm10,xmm4
-
-
-
-
- vpxord ymm4,ymm14,YMMWORD[rcx]
- vpxord ymm5,ymm14,YMMWORD[32+rcx]
- vpxord ymm6,ymm14,YMMWORD[64+rcx]
- vpxord ymm7,ymm14,YMMWORD[96+rcx]
-
-
-
- vaesenclast ymm4,ymm0,ymm4
- vaesenclast ymm5,ymm1,ymm5
- vaesenclast ymm6,ymm2,ymm6
- vaesenclast ymm7,ymm3,ymm7
-
-
- vmovdqu8 YMMWORD[rdx],ymm4
- vmovdqu8 YMMWORD[32+rdx],ymm5
- vmovdqu8 YMMWORD[64+rdx],ymm6
- vmovdqu8 YMMWORD[96+rdx],ymm7
-
- sub rcx,-4*32
- sub rdx,-4*32
- add r8,-4*32
- cmp r8,4*32-1
- ja NEAR $L$crypt_loop_4x__func2
-$L$crypt_loop_4x_done__func2:
-
- test r8,r8
- jz NEAR $L$done__func2
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- mov rax,r8
- neg rax
- and rax,-16
- lea rsi,[256+rax*1+rdi]
- vpxor xmm4,xmm4,xmm4
- vpxor xmm5,xmm5,xmm5
- vpxor xmm6,xmm6,xmm6
-
- cmp r8,32
- jb NEAR $L$partial_vec__func2
-
-$L$crypt_loop_1x__func2:
-
-
-
- vpshufb ymm0,ymm12,ymm8
- vpaddd ymm12,ymm12,ymm11
- vpxord ymm0,ymm0,ymm13
- lea rax,[16+r9]
-$L$vaesenc_loop_tail_full_vec__func2:
- vbroadcasti32x4 ymm9,YMMWORD[rax]
- vaesenc ymm0,ymm0,ymm9
- add rax,16
- cmp r11,rax
- jne NEAR $L$vaesenc_loop_tail_full_vec__func2
- vaesenclast ymm0,ymm0,ymm14
-
-
- vmovdqu8 ymm1,YMMWORD[rcx]
- vpxord ymm0,ymm0,ymm1
- vmovdqu8 YMMWORD[rdx],ymm0
-
-
- vmovdqu8 ymm30,YMMWORD[rsi]
- vpshufb ymm0,ymm1,ymm8
- vpxord ymm0,ymm0,ymm10
- vpclmulqdq ymm7,ymm0,ymm30,0x00
- vpclmulqdq ymm1,ymm0,ymm30,0x01
- vpclmulqdq ymm2,ymm0,ymm30,0x10
- vpclmulqdq ymm3,ymm0,ymm30,0x11
- vpxord ymm4,ymm4,ymm7
- vpternlogd ymm5,ymm1,ymm2,0x96
- vpxord ymm6,ymm6,ymm3
-
- vpxor xmm10,xmm10,xmm10
-
- add rsi,32
- add rcx,32
- add rdx,32
- sub r8,32
- cmp r8,32
- jae NEAR $L$crypt_loop_1x__func2
-
- test r8,r8
- jz NEAR $L$reduce__func2
-
-$L$partial_vec__func2:
-
-
-
-
- mov rax,-1
- bzhi rax,rax,r8
- kmovd k1,eax
- add r8,15
- and r8,-16
- mov rax,-1
- bzhi rax,rax,r8
- kmovd k2,eax
-
-
-
- vpshufb ymm0,ymm12,ymm8
- vpxord ymm0,ymm0,ymm13
- lea rax,[16+r9]
-$L$vaesenc_loop_tail_partialvec__func2:
- vbroadcasti32x4 ymm9,YMMWORD[rax]
- vaesenc ymm0,ymm0,ymm9
- add rax,16
- cmp r11,rax
- jne NEAR $L$vaesenc_loop_tail_partialvec__func2
- vaesenclast ymm0,ymm0,ymm14
-
-
- vmovdqu8 ymm1{k1}{z},[rcx]
- vpxord ymm0,ymm0,ymm1
- vmovdqu8 YMMWORD[rdx]{k1},ymm0
-
-
-
-
-
-
-
-
-
-
-
-
-
- vmovdqu8 ymm30{k2}{z},[rsi]
-
- vpshufb ymm0,ymm1,ymm8
- vpxord ymm0,ymm0,ymm10
- vpclmulqdq ymm7,ymm0,ymm30,0x00
- vpclmulqdq ymm1,ymm0,ymm30,0x01
- vpclmulqdq ymm2,ymm0,ymm30,0x10
- vpclmulqdq ymm3,ymm0,ymm30,0x11
- vpxord ymm4,ymm4,ymm7
- vpternlogd ymm5,ymm1,ymm2,0x96
- vpxord ymm6,ymm6,ymm3
-
-
-$L$reduce__func2:
-
- vpclmulqdq ymm0,ymm31,ymm4,0x01
- vpshufd ymm4,ymm4,0x4e
- vpternlogd ymm5,ymm4,ymm0,0x96
- vpclmulqdq ymm0,ymm31,ymm5,0x01
- vpshufd ymm5,ymm5,0x4e
- vpternlogd ymm6,ymm5,ymm0,0x96
-
- vextracti32x4 xmm0,ymm6,1
- vpxord xmm10,xmm6,xmm0
-
-
-$L$done__func2:
-
- vpshufb xmm10,xmm10,xmm8
- vmovdqu XMMWORD[r12],xmm10
-
- vzeroupper
- movdqa xmm6,XMMWORD[rsp]
- movdqa xmm7,XMMWORD[16+rsp]
- movdqa xmm8,XMMWORD[32+rsp]
- movdqa xmm9,XMMWORD[48+rsp]
- movdqa xmm10,XMMWORD[64+rsp]
- movdqa xmm11,XMMWORD[80+rsp]
- movdqa xmm12,XMMWORD[96+rsp]
- movdqa xmm13,XMMWORD[112+rsp]
- movdqa xmm14,XMMWORD[128+rsp]
- movdqa xmm15,XMMWORD[144+rsp]
- add rsp,160
- pop r12
- pop rdi
- pop rsi
- ret
-$L$SEH_end_aes_gcm_dec_update_vaes_avx10_256_17:
-
-
global gcm_ghash_vpclmulqdq_avx10_512
ALIGN 32
@@ -1364,7 +234,7 @@
cmp r9,64
- jb NEAR $L$aad_blockbyblock__func2
+ jb NEAR $L$aad_blockbyblock__func1
@@ -1375,7 +245,7 @@
vmovdqu8 zmm9,ZMMWORD[((256-64))+rdx]
cmp r9,4*64-1
- jbe NEAR $L$aad_loop_1x__func2
+ jbe NEAR $L$aad_loop_1x__func1
vmovdqu8 zmm6,ZMMWORD[((256-256))+rdx]
@@ -1383,7 +253,7 @@
vmovdqu8 zmm8,ZMMWORD[((256-128))+rdx]
-$L$aad_loop_4x__func2:
+$L$aad_loop_4x__func1:
vmovdqu8 zmm0,ZMMWORD[r8]
vmovdqu8 zmm1,ZMMWORD[64+r8]
vmovdqu8 zmm2,ZMMWORD[128+r8]
@@ -1432,12 +302,12 @@
sub r8,-4*64
add r9,-4*64
cmp r9,4*64-1
- ja NEAR $L$aad_loop_4x__func2
+ ja NEAR $L$aad_loop_4x__func1
cmp r9,64
- jb NEAR $L$aad_large_done__func2
-$L$aad_loop_1x__func2:
+ jb NEAR $L$aad_large_done__func1
+$L$aad_loop_1x__func1:
vmovdqu8 zmm0,ZMMWORD[r8]
vpshufb zmm0,zmm0,zmm4
vpxord zmm5,zmm5,zmm0
@@ -1462,19 +332,19 @@
add r8,64
sub r9,64
cmp r9,64
- jae NEAR $L$aad_loop_1x__func2
+ jae NEAR $L$aad_loop_1x__func1
-$L$aad_large_done__func2:
+$L$aad_large_done__func1:
vzeroupper
-$L$aad_blockbyblock__func2:
+$L$aad_blockbyblock__func1:
test r9,r9
- jz NEAR $L$aad_done__func2
+ jz NEAR $L$aad_done__func1
vmovdqu xmm9,XMMWORD[((256-16))+rdx]
-$L$aad_loop_blockbyblock__func2:
+$L$aad_loop_blockbyblock__func1:
vmovdqu xmm0,XMMWORD[r8]
vpshufb xmm0,xmm0,xmm4
vpxor xmm5,xmm5,xmm0
@@ -1492,9 +362,9 @@
add r8,16
sub r9,16
- jnz NEAR $L$aad_loop_blockbyblock__func2
+ jnz NEAR $L$aad_loop_blockbyblock__func1
-$L$aad_done__func2:
+$L$aad_done__func1:
vpshufb xmm5,xmm5,xmm4
vmovdqu XMMWORD[rcx],xmm5
@@ -1588,7 +458,7 @@
cmp r8,4*64-1
- jbe NEAR $L$crypt_loop_4x_done__func3
+ jbe NEAR $L$crypt_loop_4x_done__func1
vmovdqu8 zmm27,ZMMWORD[((256-256))+rdi]
@@ -1615,7 +485,7 @@
vpxord zmm3,zmm3,zmm13
lea rax,[16+r9]
-$L$vaesenc_loop_first_4_vecs__func3:
+$L$vaesenc_loop_first_4_vecs__func1:
vbroadcasti32x4 zmm9,ZMMWORD[rax]
vaesenc zmm0,zmm0,zmm9
vaesenc zmm1,zmm1,zmm9
@@ -1624,7 +494,7 @@
add rax,16
cmp r11,rax
- jne NEAR $L$vaesenc_loop_first_4_vecs__func3
+ jne NEAR $L$vaesenc_loop_first_4_vecs__func1
@@ -1650,7 +520,7 @@
sub rdx,-4*64
add r8,-4*64
cmp r8,4*64-1
- jbe NEAR $L$ghash_last_ciphertext_4x__func3
+ jbe NEAR $L$ghash_last_ciphertext_4x__func1
vbroadcasti32x4 zmm15,ZMMWORD[((-144))+r11]
vbroadcasti32x4 zmm16,ZMMWORD[((-128))+r11]
vbroadcasti32x4 zmm17,ZMMWORD[((-112))+r11]
@@ -1660,7 +530,7 @@
vbroadcasti32x4 zmm21,ZMMWORD[((-48))+r11]
vbroadcasti32x4 zmm22,ZMMWORD[((-32))+r11]
vbroadcasti32x4 zmm23,ZMMWORD[((-16))+r11]
-$L$crypt_loop_4x__func3:
+$L$crypt_loop_4x__func1:
@@ -1680,8 +550,8 @@
vpxord zmm3,zmm3,zmm13
cmp r10d,24
- jl NEAR $L$aes128__func3
- je NEAR $L$aes192__func3
+ jl NEAR $L$aes128__func1
+ je NEAR $L$aes192__func1
vbroadcasti32x4 zmm9,ZMMWORD[((-208))+r11]
vaesenc zmm0,zmm0,zmm9
@@ -1695,7 +565,7 @@
vaesenc zmm2,zmm2,zmm9
vaesenc zmm3,zmm3,zmm9
-$L$aes192__func3:
+$L$aes192__func1:
vbroadcasti32x4 zmm9,ZMMWORD[((-176))+r11]
vaesenc zmm0,zmm0,zmm9
vaesenc zmm1,zmm1,zmm9
@@ -1708,7 +578,7 @@
vaesenc zmm2,zmm2,zmm9
vaesenc zmm3,zmm3,zmm9
-$L$aes128__func3:
+$L$aes128__func1:
vpshufb zmm4,zmm4,zmm8
vpxord zmm4,zmm4,zmm10
vpshufb zmm5,zmm5,zmm8
@@ -1829,8 +699,8 @@
sub rdx,-4*64
add r8,-4*64
cmp r8,4*64-1
- ja NEAR $L$crypt_loop_4x__func3
-$L$ghash_last_ciphertext_4x__func3:
+ ja NEAR $L$crypt_loop_4x__func1
+$L$ghash_last_ciphertext_4x__func1:
vpshufb zmm4,zmm4,zmm8
vpxord zmm4,zmm4,zmm10
vpshufb zmm5,zmm5,zmm8
@@ -1872,10 +742,10 @@
vpxord xmm10,xmm10,xmm4
vpternlogd xmm10,xmm6,xmm5,0x96
-$L$crypt_loop_4x_done__func3:
+$L$crypt_loop_4x_done__func1:
test r8,r8
- jz NEAR $L$done__func3
+ jz NEAR $L$done__func1
@@ -1905,9 +775,9 @@
vpxor xmm6,xmm6,xmm6
cmp r8,64
- jb NEAR $L$partial_vec__func3
+ jb NEAR $L$partial_vec__func1
-$L$crypt_loop_1x__func3:
+$L$crypt_loop_1x__func1:
@@ -1915,12 +785,12 @@
vpaddd zmm12,zmm12,zmm11
vpxord zmm0,zmm0,zmm13
lea rax,[16+r9]
-$L$vaesenc_loop_tail_full_vec__func3:
+$L$vaesenc_loop_tail_full_vec__func1:
vbroadcasti32x4 zmm9,ZMMWORD[rax]
vaesenc zmm0,zmm0,zmm9
add rax,16
cmp r11,rax
- jne NEAR $L$vaesenc_loop_tail_full_vec__func3
+ jne NEAR $L$vaesenc_loop_tail_full_vec__func1
vaesenclast zmm0,zmm0,zmm14
@@ -1947,12 +817,12 @@
add rdx,64
sub r8,64
cmp r8,64
- jae NEAR $L$crypt_loop_1x__func3
+ jae NEAR $L$crypt_loop_1x__func1
test r8,r8
- jz NEAR $L$reduce__func3
+ jz NEAR $L$reduce__func1
-$L$partial_vec__func3:
+$L$partial_vec__func1:
@@ -1971,12 +841,12 @@
vpshufb zmm0,zmm12,zmm8
vpxord zmm0,zmm0,zmm13
lea rax,[16+r9]
-$L$vaesenc_loop_tail_partialvec__func3:
+$L$vaesenc_loop_tail_partialvec__func1:
vbroadcasti32x4 zmm9,ZMMWORD[rax]
vaesenc zmm0,zmm0,zmm9
add rax,16
cmp r11,rax
- jne NEAR $L$vaesenc_loop_tail_partialvec__func3
+ jne NEAR $L$vaesenc_loop_tail_partialvec__func1
vaesenclast zmm0,zmm0,zmm14
@@ -2009,7 +879,7 @@
vpxord zmm6,zmm6,zmm3
-$L$reduce__func3:
+$L$reduce__func1:
vpclmulqdq zmm0,zmm31,zmm4,0x01
vpshufd zmm4,zmm4,0x4e
@@ -2025,7 +895,7 @@
vpternlogd xmm10,xmm2,xmm1,0x96
-$L$done__func3:
+$L$done__func1:
vpshufb xmm10,xmm10,xmm8
vmovdqu XMMWORD[r12],xmm10
@@ -2122,7 +992,7 @@
cmp r8,4*64-1
- jbe NEAR $L$crypt_loop_4x_done__func4
+ jbe NEAR $L$crypt_loop_4x_done__func2
vmovdqu8 zmm27,ZMMWORD[((256-256))+rdi]
@@ -2138,7 +1008,7 @@
vbroadcasti32x4 zmm21,ZMMWORD[((-48))+r11]
vbroadcasti32x4 zmm22,ZMMWORD[((-32))+r11]
vbroadcasti32x4 zmm23,ZMMWORD[((-16))+r11]
-$L$crypt_loop_4x__func4:
+$L$crypt_loop_4x__func2:
vmovdqu8 zmm4,ZMMWORD[rcx]
vmovdqu8 zmm5,ZMMWORD[64+rcx]
vmovdqu8 zmm6,ZMMWORD[128+rcx]
@@ -2162,8 +1032,8 @@
vpxord zmm3,zmm3,zmm13
cmp r10d,24
- jl NEAR $L$aes128__func4
- je NEAR $L$aes192__func4
+ jl NEAR $L$aes128__func2
+ je NEAR $L$aes192__func2
vbroadcasti32x4 zmm9,ZMMWORD[((-208))+r11]
vaesenc zmm0,zmm0,zmm9
@@ -2177,7 +1047,7 @@
vaesenc zmm2,zmm2,zmm9
vaesenc zmm3,zmm3,zmm9
-$L$aes192__func4:
+$L$aes192__func2:
vbroadcasti32x4 zmm9,ZMMWORD[((-176))+r11]
vaesenc zmm0,zmm0,zmm9
vaesenc zmm1,zmm1,zmm9
@@ -2190,7 +1060,7 @@
vaesenc zmm2,zmm2,zmm9
vaesenc zmm3,zmm3,zmm9
-$L$aes128__func4:
+$L$aes128__func2:
vpshufb zmm4,zmm4,zmm8
vpxord zmm4,zmm4,zmm10
vpshufb zmm5,zmm5,zmm8
@@ -2311,11 +1181,11 @@
sub rdx,-4*64
add r8,-4*64
cmp r8,4*64-1
- ja NEAR $L$crypt_loop_4x__func4
-$L$crypt_loop_4x_done__func4:
+ ja NEAR $L$crypt_loop_4x__func2
+$L$crypt_loop_4x_done__func2:
test r8,r8
- jz NEAR $L$done__func4
+ jz NEAR $L$done__func2
@@ -2345,9 +1215,9 @@
vpxor xmm6,xmm6,xmm6
cmp r8,64
- jb NEAR $L$partial_vec__func4
+ jb NEAR $L$partial_vec__func2
-$L$crypt_loop_1x__func4:
+$L$crypt_loop_1x__func2:
@@ -2355,12 +1225,12 @@
vpaddd zmm12,zmm12,zmm11
vpxord zmm0,zmm0,zmm13
lea rax,[16+r9]
-$L$vaesenc_loop_tail_full_vec__func4:
+$L$vaesenc_loop_tail_full_vec__func2:
vbroadcasti32x4 zmm9,ZMMWORD[rax]
vaesenc zmm0,zmm0,zmm9
add rax,16
cmp r11,rax
- jne NEAR $L$vaesenc_loop_tail_full_vec__func4
+ jne NEAR $L$vaesenc_loop_tail_full_vec__func2
vaesenclast zmm0,zmm0,zmm14
@@ -2387,12 +1257,12 @@
add rdx,64
sub r8,64
cmp r8,64
- jae NEAR $L$crypt_loop_1x__func4
+ jae NEAR $L$crypt_loop_1x__func2
test r8,r8
- jz NEAR $L$reduce__func4
+ jz NEAR $L$reduce__func2
-$L$partial_vec__func4:
+$L$partial_vec__func2:
@@ -2411,12 +1281,12 @@
vpshufb zmm0,zmm12,zmm8
vpxord zmm0,zmm0,zmm13
lea rax,[16+r9]
-$L$vaesenc_loop_tail_partialvec__func4:
+$L$vaesenc_loop_tail_partialvec__func2:
vbroadcasti32x4 zmm9,ZMMWORD[rax]
vaesenc zmm0,zmm0,zmm9
add rax,16
cmp r11,rax
- jne NEAR $L$vaesenc_loop_tail_partialvec__func4
+ jne NEAR $L$vaesenc_loop_tail_partialvec__func2
vaesenclast zmm0,zmm0,zmm14
@@ -2449,7 +1319,7 @@
vpxord zmm6,zmm6,zmm3
-$L$reduce__func4:
+$L$reduce__func2:
vpclmulqdq zmm0,zmm31,zmm4,0x01
vpshufd zmm4,zmm4,0x4e
@@ -2465,7 +1335,7 @@
vpternlogd xmm10,xmm2,xmm1,0x96
-$L$done__func4:
+$L$done__func2:
vpshufb xmm10,xmm10,xmm8
vmovdqu XMMWORD[r12],xmm10
@@ -2495,18 +1365,6 @@
DD $L$SEH_end_gcm_gmult_vpclmulqdq_avx10_5 wrt ..imagebase
DD $L$SEH_info_gcm_gmult_vpclmulqdq_avx10_0 wrt ..imagebase
- DD $L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1 wrt ..imagebase
- DD $L$SEH_end_gcm_ghash_vpclmulqdq_avx10_256_12 wrt ..imagebase
- DD $L$SEH_info_gcm_ghash_vpclmulqdq_avx10_256_0 wrt ..imagebase
-
- DD $L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1 wrt ..imagebase
- DD $L$SEH_end_aes_gcm_enc_update_vaes_avx10_256_17 wrt ..imagebase
- DD $L$SEH_info_aes_gcm_enc_update_vaes_avx10_256_0 wrt ..imagebase
-
- DD $L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1 wrt ..imagebase
- DD $L$SEH_end_aes_gcm_dec_update_vaes_avx10_256_17 wrt ..imagebase
- DD $L$SEH_info_aes_gcm_dec_update_vaes_avx10_256_0 wrt ..imagebase
-
DD $L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1 wrt ..imagebase
DD $L$SEH_end_gcm_ghash_vpclmulqdq_avx10_512_12 wrt ..imagebase
DD $L$SEH_info_gcm_ghash_vpclmulqdq_avx10_512_0 wrt ..imagebase
@@ -2534,131 +1392,6 @@
DB 34
DW 0
-$L$SEH_info_gcm_ghash_vpclmulqdq_avx10_256_0:
- DB 1
- DB $L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx10_256_11-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
- DB 18
- DB 0
- DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_10-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
- DB 216
- DW 7
- DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_9-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
- DB 200
- DW 6
- DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_8-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
- DB 184
- DW 5
- DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_7-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
- DB 168
- DW 4
- DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_6-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
- DB 152
- DW 3
- DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_5-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
- DB 136
- DW 2
- DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_4-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
- DB 120
- DW 1
- DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_3-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
- DB 104
- DW 0
- DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_2-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
- DB 1
- DW 17
-
-$L$SEH_info_aes_gcm_enc_update_vaes_avx10_256_0:
- DB 1
- DB $L$SEH_endprologue_aes_gcm_enc_update_vaes_avx10_256_16-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
- DB 25
- DB 0
- DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_15-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
- DB 248
- DW 9
- DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_14-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
- DB 232
- DW 8
- DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_13-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
- DB 216
- DW 7
- DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_12-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
- DB 200
- DW 6
- DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_11-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
- DB 184
- DW 5
- DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_10-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
- DB 168
- DW 4
- DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_9-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
- DB 152
- DW 3
- DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_8-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
- DB 136
- DW 2
- DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_7-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
- DB 120
- DW 1
- DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_6-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
- DB 104
- DW 0
- DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_5-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
- DB 1
- DW 20
- DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_4-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
- DB 192
- DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_3-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
- DB 112
- DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_2-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
- DB 96
-
- DW 0
-$L$SEH_info_aes_gcm_dec_update_vaes_avx10_256_0:
- DB 1
- DB $L$SEH_endprologue_aes_gcm_dec_update_vaes_avx10_256_16-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
- DB 25
- DB 0
- DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_15-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
- DB 248
- DW 9
- DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_14-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
- DB 232
- DW 8
- DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_13-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
- DB 216
- DW 7
- DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_12-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
- DB 200
- DW 6
- DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_11-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
- DB 184
- DW 5
- DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_10-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
- DB 168
- DW 4
- DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_9-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
- DB 152
- DW 3
- DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_8-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
- DB 136
- DW 2
- DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_7-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
- DB 120
- DW 1
- DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_6-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
- DB 104
- DW 0
- DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_5-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
- DB 1
- DW 20
- DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_4-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
- DB 192
- DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_3-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
- DB 112
- DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_2-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
- DB 96
-
- DW 0
$L$SEH_info_gcm_ghash_vpclmulqdq_avx10_512_0:
DB 1
DB $L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx10_512_11-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1