Add VAES and VPCLMULQDQ accelerated AES-GCM

Add an AES-GCM implementation for x86_64 that uses VAES, VPCLMULQDQ, and
either AVX10 or a compatible AVX512 feature set. The assembly code is
based on the code I wrote for the Linux kernel
(https://git.kernel.org/linus/b06affb1cb580e13). Some substantial
changes were needed for BoringSSL integration; see the file comment.

The following tables compare the performance of AES-256-GCM before and
after this patch, and also versus the alternative patch from Cloudflare
(https://boringssl-review.googlesource.com/c/boringssl/+/65987/3). All
tables show throughput in MB/s; each row is an implementation and each
column is a message length in bytes. All benchmarks were done using
EVP_AEAD_CTX_seal() and EVP_AEAD_CTX_open() with an associated data
length of 16 bytes.
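
As a point of reference, the benchmark calls look roughly like the
following sketch of BoringSSL's public EVP_AEAD API (the key, nonce, and
buffer handling here are illustrative, not the exact benchmark harness):

  #include <openssl/aead.h>

  // Seal one message with AES-256-GCM and a 16-byte AD, as in the
  // benchmarks described above (sizes and buffers chosen for illustration).
  int seal_once(const uint8_t key[32], const uint8_t nonce[12],
                const uint8_t *msg, size_t msg_len, const uint8_t ad[16],
                uint8_t *out, size_t max_out_len) {
    EVP_AEAD_CTX ctx;
    if (!EVP_AEAD_CTX_init(&ctx, EVP_aead_aes_256_gcm(), key, 32,
                           EVP_AEAD_DEFAULT_TAG_LENGTH, NULL)) {
      return 0;
    }
    size_t out_len;
    // max_out_len must allow for the 16-byte GCM tag appended to the output.
    int ok = EVP_AEAD_CTX_seal(&ctx, out, &out_len, max_out_len, nonce, 12,
                               msg, msg_len, ad, 16);
    EVP_AEAD_CTX_cleanup(&ctx);
    return ok;
  }
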
AMD Zen 5, Granite Ridge (encryption):

           | 16384 | 4096  | 4095  | 1420  | 512   | 500   |
-----------+-------+-------+-------+-------+-------+-------+
This patch | 26358 | 21295 | 17402 | 10672 | 7798  | 4840  |
Cloudflare | 22363 | 18330 | 17008 | 10979 | 7070  | 5870  |
Existing   | 7194  | 6743  | 6465  | 5404  | 4075  | 3563  |

           | 300   | 200   | 64    | 63    | 16    |
-----------+-------+-------+-------+-------+-------+
This patch | 3248  | 2557  | 1359  | 937   | 537   |
Cloudflare | 3624  | 2770  | 1293  | 1028  | 517   |
Existing   | 2938  | 2271  | 1266  | 959   | 528   |

AMD Zen 5, Granite Ridge (decryption):

           | 16384 | 4096  | 4095  | 1420  | 512   | 500   |
-----------+-------+-------+-------+-------+-------+-------+
This patch | 27214 | 22298 | 18824 | 11401 | 8496  | 5399  |
Cloudflare | 22629 | 19257 | 17792 | 11575 | 7807  | 6031  |
Existing   | 7122  | 6805  | 6228  | 4922  | 4604  | 3565  |

           | 300   | 200   | 64    | 63    | 16    |
-----------+-------+-------+-------+-------+-------+
This patch | 3637  | 2497  | 1483  | 952   | 589   |
Cloudflare | 3714  | 2847  | 1437  | 1030  | 567   |
Existing   | 3012  | 2354  | 1514  | 880   | 632   |

AMD Zen 4, Genoa (encryption):

           | 16384 | 4096  | 4095  | 1420  | 512   | 500   |
-----------+-------+-------+-------+-------+-------+-------+
This patch | 10093 | 8907  | 7614  | 5399  | 4247  | 2719  |
Cloudflare | 9174  | 8073  | 7521  | 5414  | 3786  | 3111  |
Existing   | 4239  | 3964  | 3800  | 3186  | 2398  | 2069  |

           | 300   | 200   | 64    | 63    | 16    |
-----------+-------+-------+-------+-------+-------+
This patch | 1940  | 1553  | 851   | 581   | 343   |
Cloudflare | 2023  | 1619  | 775   | 619   | 311   |
Existing   | 1735  | 1334  | 775   | 573   | 317   |

AMD Zen 4, Genoa (decryption):

           | 16384 | 4096  | 4095  | 1420  | 512   | 500   |
-----------+-------+-------+-------+-------+-------+-------+
This patch | 10108 | 8922  | 7879  | 5526  | 4250  | 2872  |
Cloudflare | 9441  | 8347  | 7723  | 5366  | 3902  | 3067  |
Existing   | 4249  | 3999  | 3810  | 3101  | 2535  | 2026  |

           | 300   | 200   | 64    | 63    | 16    |
-----------+-------+-------+-------+-------+-------+
This patch | 2031  | 1536  | 868   | 568   | 346   |
Cloudflare | 1933  | 1579  | 765   | 569   | 300   |
Existing   | 1723  | 1381  | 806   | 516   | 345   |

Intel Emerald Rapids (encryption):

           | 16384 | 4096  | 4095  | 1420  | 512   | 500   |
-----------+-------+-------+-------+-------+-------+-------+
This patch | 13974 | 11827 | 10166 | 6601  | 4904  | 3334  |
Cloudflare | 12735 | 10752 | 9966  | 6709  | 4524  | 3647  |
Existing   | 5237  | 4831  | 4639  | 3747  | 2816  | 2409  |

           | 300   | 200   | 64    | 63    | 16    |
-----------+-------+-------+-------+-------+-------+
This patch | 2251  | 1763  | 915   | 649   | 363   |
Cloudflare | 2329  | 1850  | 855   | 676   | 342   |
Existing   | 1971  | 1502  | 808   | 626   | 359   |

Intel Emerald Rapids (decryption):

           | 16384 | 4096  | 4095  | 1420  | 512   | 500   |
-----------+-------+-------+-------+-------+-------+-------+
This patch | 14239 | 12180 | 10370 | 6692  | 5305  | 3344  |
Cloudflare | 13348 | 11485 | 10460 | 6736  | 5229  | 3641  |
Existing   | 5306  | 4958  | 4702  | 3767  | 3071  | 2432  |

           | 300   | 200   | 64    | 63    | 16    |
-----------+-------+-------+-------+-------+-------+
This patch | 2197  | 2077  | 1040  | 628   | 390   |
Cloudflare | 2186  | 1911  | 938   | 615   | 370   |
Existing   | 2024  | 1727  | 999   | 599   | 421   |

Intel Sapphire Rapids (encryption):

           | 16384 | 4096  | 4095  | 1420  | 512   | 500   |
-----------+-------+-------+-------+-------+-------+-------+
This patch | 12726 | 10618 | 9248  | 6012  | 4466  | 2986  |
Cloudflare | 11059 | 9794  | 9071  | 6052  | 4089  | 3306  |
Existing   | 4761  | 4397  | 4222  | 3408  | 2560  | 2188  |

           | 300   | 200   | 64    | 63    | 16    |
-----------+-------+-------+-------+-------+-------+
This patch | 2051  | 1612  | 838   | 579   | 351   |
Cloudflare | 2110  | 1686  | 775   | 622   | 311   |
Existing   | 1792  | 1369  | 733   | 567   | 324   |

Intel Sapphire Rapids (decryption):

           | 16384 | 4096  | 4095  | 1420  | 512   | 500   |
-----------+-------+-------+-------+-------+-------+-------+
This patch | 12951 | 11100 | 9447  | 6067  | 4862  | 3030  |
Cloudflare | 12165 | 10421 | 9506  | 6126  | 4767  | 3321  |
Existing   | 4807  | 4507  | 4275  | 3400  | 2791  | 2216  |

           | 300   | 200   | 64    | 63    | 16    |
-----------+-------+-------+-------+-------+-------+
This patch | 2003  | 1894  | 950   | 572   | 357   |
Cloudflare | 1999  | 1741  | 857   | 559   | 328   |
Existing   | 1831  | 1571  | 838   | 539   | 382   |

Change-Id: I5b0833d2ffe8fd273cb38a26cd104c52c3532ceb
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/70187
Reviewed-by: David Benjamin <davidben@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/build.json b/build.json
index 04d9bf9..9a667f3 100644
--- a/build.json
+++ b/build.json
@@ -132,6 +132,7 @@
],
"perlasm_x86_64": [
{"src": "crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl"},
+ {"src": "crypto/fipsmodule/modes/asm/aes-gcm-avx10-x86_64.pl"},
{"src": "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"},
{"src": "crypto/fipsmodule/modes/asm/ghash-ssse3-x86_64.pl"},
{"src": "crypto/fipsmodule/modes/asm/ghash-x86_64.pl"},
diff --git a/crypto/crypto.cc b/crypto/crypto.cc
index ead0543..9f8eba4 100644
--- a/crypto/crypto.cc
+++ b/crypto/crypto.cc
@@ -54,7 +54,7 @@
// archive, linking on OS X will fail to resolve common symbols. By
// initialising it to zero, it becomes a "data symbol", which isn't so
// affected.
-HIDDEN uint8_t BORINGSSL_function_hit[7] = {0};
+HIDDEN uint8_t BORINGSSL_function_hit[8] = {0};
#endif
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
diff --git a/crypto/fipsmodule/modes/asm/aes-gcm-avx10-x86_64.pl b/crypto/fipsmodule/modes/asm/aes-gcm-avx10-x86_64.pl
new file mode 100644
index 0000000..b65dee9
--- /dev/null
+++ b/crypto/fipsmodule/modes/asm/aes-gcm-avx10-x86_64.pl
@@ -0,0 +1,1358 @@
+#!/usr/bin/env perl
+# Copyright 2024 Google LLC
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+#
+#------------------------------------------------------------------------------
+#
+# VAES and VPCLMULQDQ optimized AES-GCM for x86_64
+#
+# This file is based on aes-gcm-avx10-x86_64.S from the Linux kernel
+# (https://git.kernel.org/linus/b06affb1cb580e13). The following notable
+# changes have been made:
+#
+# - Relicensed under BoringSSL's preferred license.
+#
+# - Converted from GNU assembler to "perlasm". This was necessary for
+# compatibility with BoringSSL's Windows builds which use NASM instead of the
+# GNU assembler. It was also necessary for compatibility with the 'delocate'
+# tool used in BoringSSL's FIPS builds.
+#
+# - Added support for the Windows ABI.
+#
+# - Changed function prototypes to be compatible with what BoringSSL wants.
+#
+# - Removed the optimized finalization function, as BoringSSL doesn't want it.
+#
+# - Added a single-block GHASH multiplication function, as BoringSSL needs this.
+#
+# - Added optimization for large amounts of AAD.
+#
+#------------------------------------------------------------------------------
+#
+# This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
+# support VAES (vector AES), VPCLMULQDQ (vector carryless multiplication), and
+# either AVX512 or AVX10. Some of the functions, notably the encryption and
+# decryption update functions which are the most performance-critical, are
+# provided in two variants generated from a macro: one using 256-bit vectors
+# (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512). The
+# other, "shared" functions (vaes_avx10) use at most 256-bit vectors.
+#
+# The functions that use 512-bit vectors are intended for CPUs that support
+# 512-bit vectors *and* where using them doesn't cause significant
+# downclocking. They require the following CPU features:
+#
+# VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/512)
+#
+# The other functions require the following CPU features:
+#
+# VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/256)
+#
+# Note that we use "avx10" in the names of the functions as a shorthand to
+# really mean "AVX10 or a certain set of AVX512 features". Due to Intel's
+# introduction of AVX512 and then its replacement by AVX10, there doesn't seem
+# to be a simple way to name things that makes sense on all CPUs.
+#
+# Note that the macros that support both 256-bit and 512-bit vectors could
+# fairly easily be changed to support 128-bit too. However, this would *not*
+# be sufficient to allow the code to run on CPUs without AVX512 or AVX10,
+# because the code heavily uses several features of these extensions other than
+# the vector length: the increase in the number of SIMD registers from 16 to
+# 32, masking support, and new instructions such as vpternlogd (which can do a
+# three-argument XOR). These features are very useful for AES-GCM.
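+#
+# For orientation only (the dispatch logic lives in C elsewhere, and the
+# helper names below are assumptions for illustration rather than a guaranteed
+# API), the requirements above amount to a runtime check along the lines of
+# the following, taking the AVX512 branch of the requirement (an AVX10/256
+# check would substitute for the last two helpers):
+#
+#     if (CRYPTO_is_VAES_capable() && CRYPTO_is_VPCLMULQDQ_capable() &&
+#         CRYPTO_is_BMI2_capable() && CRYPTO_is_AVX512BW_capable() &&
+#         CRYPTO_is_AVX512VL_capable()) {
+#       // The vaes_avx10_256 functions may be used. Selecting the
+#       // vaes_avx10_512 variants additionally requires that 512-bit vectors
+#       // not cause significant downclocking on the CPU in question.
+#     }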
+
+$flavour = shift;
+$output = shift;
+if ( $flavour =~ /\./ ) { $output = $flavour; undef $flavour; }
+
+if ( $flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/ ) {
+ $win64 = 1;
+ @argregs = ( "%rcx", "%rdx", "%r8", "%r9" );
+}
+else {
+ $win64 = 0;
+ @argregs = ( "%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9" );
+}
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/;
+$dir = $1;
+( $xlate = "${dir}x86_64-xlate.pl" and -f $xlate )
+ or ( $xlate = "${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate )
+ or die "can't locate x86_64-xlate.pl";
+
+open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT = *OUT;
+
+sub _begin_func {
+ my ( $funcname, $uses_seh ) = @_;
+ $g_cur_func_name = $funcname;
+ $g_cur_func_uses_seh = $uses_seh;
+ @g_cur_func_saved_gpregs = ();
+ @g_cur_func_saved_xmmregs = ();
+ return <<___;
+.globl $funcname
+.type $funcname,\@abi-omnipotent
+.align 32
+$funcname:
+ .cfi_startproc
+ @{[ $uses_seh ? ".seh_startproc" : "" ]}
+ _CET_ENDBR
+___
+}
+
+# Push a list of general purpose registers onto the stack.
+sub _save_gpregs {
+ my @gpregs = @_;
+ my $code = "";
+ die "_save_gpregs requires uses_seh" unless $g_cur_func_uses_seh;
+ die "_save_gpregs can only be called once per function"
+ if @g_cur_func_saved_gpregs;
+ die "Order must be _save_gpregs, then _save_xmmregs"
+ if @g_cur_func_saved_xmmregs;
+ @g_cur_func_saved_gpregs = @gpregs;
+ for my $reg (@gpregs) {
+ $code .= "push $reg\n";
+ if ($win64) {
+ $code .= ".seh_pushreg $reg\n";
+ }
+ else {
+ $code .= ".cfi_push $reg\n";
+ }
+ }
+ return $code;
+}
+
+# Push a list of xmm registers onto the stack if the target is Windows.
+sub _save_xmmregs {
+ my @xmmregs = @_;
+ my $num_xmmregs = scalar @xmmregs;
+ my $code = "";
+ die "_save_xmmregs requires uses_seh" unless $g_cur_func_uses_seh;
+ die "_save_xmmregs can only be called once per function"
+ if @g_cur_func_saved_xmmregs;
+ if ( $win64 and $num_xmmregs > 0 ) {
+ @g_cur_func_saved_xmmregs = @xmmregs;
+ my $is_misaligned = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0;
+ my $alloc_size = 16 * $num_xmmregs + ( $is_misaligned ? 8 : 0 );
+ $code .= "sub \$$alloc_size, %rsp\n";
+ $code .= ".seh_stackalloc $alloc_size\n";
+ for my $i ( 0 .. $num_xmmregs - 1 ) {
+ my $reg_num = $xmmregs[$i];
+ my $pos = 16 * $i;
+ $code .= "movdqa %xmm$reg_num, $pos(%rsp)\n";
+ $code .= ".seh_savexmm %xmm$reg_num, $pos\n";
+ }
+ }
+ return $code;
+}
+
+sub _end_func {
+ my $code = "";
+
+ # Restore any xmm registers that were saved earlier.
+ my $num_xmmregs = scalar @g_cur_func_saved_xmmregs;
+ if ( $win64 and $num_xmmregs > 0 ) {
+ my $need_alignment = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0;
+ my $alloc_size = 16 * $num_xmmregs + ( $need_alignment ? 8 : 0 );
+ for my $i ( 0 .. $num_xmmregs - 1 ) {
+ my $reg_num = $g_cur_func_saved_xmmregs[$i];
+ my $pos = 16 * $i;
+ $code .= "movdqa $pos(%rsp), %xmm$reg_num\n";
+ }
+        $code .= "add \$$alloc_size, %rsp\n";
+ }
+
+ # Restore any general purpose registers that were saved earlier.
+ for my $reg ( reverse @g_cur_func_saved_gpregs ) {
+ $code .= "pop $reg\n";
+ if ( !$win64 ) {
+ $code .= ".cfi_pop $reg\n";
+ }
+ }
+
+ $code .= <<___;
+ ret
+ @{[ $g_cur_func_uses_seh ? ".seh_endproc" : "" ]}
+ .cfi_endproc
+ .size $g_cur_func_name, . - $g_cur_func_name
+___
+ return $code;
+}
+
+$code = <<___;
+.section .rodata
+.align 64
+
+ # A shuffle mask that reflects the bytes of 16-byte blocks
+.Lbswap_mask:
+ .quad 0x08090a0b0c0d0e0f, 0x0001020304050607
+
+ # This is the GHASH reducing polynomial without its constant term, i.e.
+ # x^128 + x^7 + x^2 + x, represented using the backwards mapping
+ # between bits and polynomial coefficients.
+ #
+ # Alternatively, it can be interpreted as the naturally-ordered
+ # representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the
+ # "reversed" GHASH reducing polynomial without its x^128 term.
+.Lgfpoly:
+ .quad 1, 0xc200000000000000
+
+ # Same as above, but with the (1 << 64) bit set.
+.Lgfpoly_and_internal_carrybit:
+ .quad 1, 0xc200000000000001
+
+ # The below constants are used for incrementing the counter blocks.
+ # ctr_pattern points to the four 128-bit values [0, 1, 2, 3].
+ # inc_2blocks and inc_4blocks point to the single 128-bit values 2 and
+ # 4. Note that the same '2' is reused in ctr_pattern and inc_2blocks.
+.Lctr_pattern:
+ .quad 0, 0
+ .quad 1, 0
+.Linc_2blocks:
+ .quad 2, 0
+ .quad 3, 0
+.Linc_4blocks:
+ .quad 4, 0
+
+.text
+___
+
+# Number of powers of the hash key stored in the key struct. The powers are
+# stored from highest (H^NUM_H_POWERS) to lowest (H^1).
+$NUM_H_POWERS = 16;
+
+$OFFSETOFEND_H_POWERS = $NUM_H_POWERS * 16;
+
+# Offset to 'rounds' in AES_KEY struct
+$OFFSETOF_AES_ROUNDS = 240;
+
+# The current vector length in bytes
+undef $VL;
+
+# Set the vector length in bytes. This sets the VL variable and defines
+# register aliases V0-V31 that map to the ymm or zmm registers.
+sub _set_veclen {
+ ($VL) = @_;
+ foreach my $i ( 0 .. 31 ) {
+ if ( $VL == 32 ) {
+ ${"V${i}"} = "%ymm${i}";
+ }
+ elsif ( $VL == 64 ) {
+ ${"V${i}"} = "%zmm${i}";
+ }
+ else {
+ die "Unsupported vector length";
+ }
+ }
+}
+
+# The _ghash_mul_step macro does one step of GHASH multiplication of the
+# 128-bit lanes of \a by the corresponding 128-bit lanes of \b, storing the
+# reduced products in \dst. \t0, \t1, and \t2 are temporary registers of the
+# same size as \a and \b. To complete all steps, this must be invoked with \i=0
+# through \i=9. The division into steps allows users of this macro to
+# optionally interleave the computation with other instructions. Users of this
+# macro must preserve the parameter registers across steps.
+#
+# The multiplications are done in GHASH's representation of the finite field
+# GF(2^128). Elements of GF(2^128) are represented as binary polynomials
+# (i.e. polynomials whose coefficients are bits) modulo a reducing polynomial
+# G. The GCM specification uses G = x^128 + x^7 + x^2 + x + 1. Addition is
+# just XOR, while multiplication is more complex and has two parts: (a) do
+# carryless multiplication of two 128-bit input polynomials to get a 256-bit
+# intermediate product polynomial, and (b) reduce the intermediate product to
+# 128 bits by adding multiples of G that cancel out terms in it. (Adding
+# multiples of G doesn't change which field element the polynomial represents.)
+#
+# Unfortunately, the GCM specification maps bits to/from polynomial
+# coefficients backwards from the natural order. In each byte it specifies the
+# highest bit to be the lowest order polynomial coefficient, *not* the highest!
+# This makes it nontrivial to work with the GHASH polynomials. We could
+# reflect the bits, but x86 doesn't have an instruction that does that.
+#
+# Instead, we operate on the values without bit-reflecting them. This *mostly*
+# just works, since XOR and carryless multiplication are symmetric with respect
+# to bit order, but it has some consequences. First, due to GHASH's byte
+# order, by skipping bit reflection, *byte* reflection becomes necessary to
+# give the polynomial terms a consistent order. E.g., considering an N-bit
+# value interpreted using the G = x^128 + x^7 + x^2 + x + 1 convention, bits 0
+# through N-1 of the byte-reflected value represent the coefficients of x^(N-1)
+# through x^0, whereas bits 0 through N-1 of the non-byte-reflected value
+# represent x^7...x^0, x^15...x^8, ..., x^(N-1)...x^(N-8) which can't be worked
+# with. Fortunately, x86's vpshufb instruction can do byte reflection.
+#
+# Second, forgoing the bit reflection causes an extra multiple of x (still
+# using the G = x^128 + x^7 + x^2 + x + 1 convention) to be introduced by each
+# multiplication. This is because an M-bit by N-bit carryless multiplication
+# really produces a (M+N-1)-bit product, but in practice it's zero-extended to
+# M+N bits. In the G = x^128 + x^7 + x^2 + x + 1 convention, which maps bits
+# to polynomial coefficients backwards, this zero-extension actually changes
+# the product by introducing an extra factor of x. Therefore, users of this
+# macro must ensure that one of the inputs has an extra factor of x^-1, i.e.
+# the multiplicative inverse of x, to cancel out the extra x.
+#
+# Third, the backwards coefficients convention is just confusing to work with,
+# since it makes "low" and "high" in the polynomial math mean the opposite of
+# their normal meaning in computer programming. This can be solved by using an
+# alternative interpretation: the polynomial coefficients are understood to be
+# in the natural order, and the multiplication is actually \a * \b * x^-128 mod
+# x^128 + x^127 + x^126 + x^121 + 1. This doesn't change the inputs, outputs,
+# or the implementation at all; it just changes the mathematical interpretation
+# of what each instruction is doing. Starting from here, we'll use this
+# alternative interpretation, as it's easier to understand the code that way.
+#
+# Moving onto the implementation, the vpclmulqdq instruction does 64 x 64 =>
+# 128-bit carryless multiplication, so we break the 128 x 128 multiplication
+# into parts as follows (the _L and _H suffixes denote low and high 64 bits):
+#
+# LO = a_L * b_L
+# MI = (a_L * b_H) + (a_H * b_L)
+# HI = a_H * b_H
+#
+# The 256-bit product is x^128*HI + x^64*MI + LO. LO, MI, and HI are 128-bit.
+# Note that MI "overlaps" with LO and HI. We don't consolidate MI into LO and
+# HI right away, since the way the reduction works makes that unnecessary.
+#
+# For the reduction, we cancel out the low 128 bits by adding multiples of G =
+# x^128 + x^127 + x^126 + x^121 + 1. This is done by two iterations, each of
+# which cancels out the next lowest 64 bits. Consider a value x^64*A + B,
+# where A and B are 128-bit. Adding B_L*G to that value gives:
+#
+# x^64*A + B + B_L*G
+# = x^64*A + x^64*B_H + B_L + B_L*(x^128 + x^127 + x^126 + x^121 + 1)
+# = x^64*A + x^64*B_H + B_L + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L
+# = x^64*A + x^64*B_H + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L + B_L
+# = x^64*(A + B_H + x^64*B_L + B_L*(x^63 + x^62 + x^57))
+#
+# So: if we sum A, B with its halves swapped, and the low half of B times x^63
+# + x^62 + x^57, we get a 128-bit value C where x^64*C is congruent to the
+# original value x^64*A + B. I.e., the low 64 bits got canceled out.
+#
+# We just need to apply this twice: first to fold LO into MI, and second to
+# fold the updated MI into HI.
+#
+# The needed three-argument XORs are done using the vpternlogd instruction with
+# immediate 0x96, since this is faster than two vpxord instructions.
+#
+# A potential optimization, assuming that b is fixed per-key (if a is fixed
+# per-key it would work the other way around), is to use one iteration of the
+# reduction described above to precompute a value c such that x^64*c = b mod G,
+# and then multiply a_L by c (and implicitly by x^64) instead of by b:
+#
+# MI = (a_L * c_L) + (a_H * b_L)
+# HI = (a_L * c_H) + (a_H * b_H)
+#
+# This would eliminate the LO part of the intermediate product, which would
+# eliminate the need to fold LO into MI. This would save two instructions,
+# including a vpclmulqdq. However, we currently don't use this optimization
+# because it would require twice as many per-key precomputed values.
+#
+# Using Karatsuba multiplication instead of "schoolbook" multiplication
+# similarly would save a vpclmulqdq but does not seem to be worth it.
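+#
+# As a quick reference for the vpclmulqdq immediates used in the steps below:
+# with the AT&T-style operand order "vpclmulqdq $imm, a, b, dst", the low
+# nibble of the immediate selects the 64-bit half of b and the high nibble
+# selects the half of a, so per 128-bit lane:
+#
+#     0x00: dst = a_L * b_L        0x01: dst = a_L * b_H
+#     0x10: dst = a_H * b_L        0x11: dst = a_H * b_H
+#
+# This is exactly the LO / MI_0 / MI_1 / HI decomposition described above.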
+sub _ghash_mul_step {
+ my ( $i, $a, $b, $dst, $gfpoly, $t0, $t1, $t2 ) = @_;
+ if ( $i == 0 ) {
+ return "vpclmulqdq \$0x00, $a, $b, $t0\n" . # LO = a_L * b_L
+ "vpclmulqdq \$0x01, $a, $b, $t1\n"; # MI_0 = a_L * b_H
+ }
+ elsif ( $i == 1 ) {
+ return "vpclmulqdq \$0x10, $a, $b, $t2\n"; # MI_1 = a_H * b_L
+ }
+ elsif ( $i == 2 ) {
+ return "vpxord $t2, $t1, $t1\n"; # MI = MI_0 + MI_1
+ }
+ elsif ( $i == 3 ) {
+ return
+ "vpclmulqdq \$0x01, $t0, $gfpoly, $t2\n"; # LO_L*(x^63 + x^62 + x^57)
+ }
+ elsif ( $i == 4 ) {
+ return "vpshufd \$0x4e, $t0, $t0\n"; # Swap halves of LO
+ }
+ elsif ( $i == 5 ) {
+ return "vpternlogd \$0x96, $t2, $t0, $t1\n"; # Fold LO into MI
+ }
+ elsif ( $i == 6 ) {
+ return "vpclmulqdq \$0x11, $a, $b, $dst\n"; # HI = a_H * b_H
+ }
+ elsif ( $i == 7 ) {
+ return
+ "vpclmulqdq \$0x01, $t1, $gfpoly, $t0\n"; # MI_L*(x^63 + x^62 + x^57)
+ }
+ elsif ( $i == 8 ) {
+ return "vpshufd \$0x4e, $t1, $t1\n"; # Swap halves of MI
+ }
+ elsif ( $i == 9 ) {
+ return "vpternlogd \$0x96, $t0, $t1, $dst\n"; # Fold MI into HI
+ }
+}
+
+# GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
+# the reduced products in \dst. See _ghash_mul_step for full explanation.
+sub _ghash_mul {
+ my ( $a, $b, $dst, $gfpoly, $t0, $t1, $t2 ) = @_;
+ my $code = "";
+ for my $i ( 0 .. 9 ) {
+ $code .= _ghash_mul_step $i, $a, $b, $dst, $gfpoly, $t0, $t1, $t2;
+ }
+ return $code;
+}
+
+# GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the
+# *unreduced* products to \lo, \mi, and \hi.
+sub _ghash_mul_noreduce {
+ my ( $a, $b, $lo, $mi, $hi, $t0, $t1, $t2, $t3 ) = @_;
+ return <<___;
+ vpclmulqdq \$0x00, $a, $b, $t0 # a_L * b_L
+ vpclmulqdq \$0x01, $a, $b, $t1 # a_L * b_H
+ vpclmulqdq \$0x10, $a, $b, $t2 # a_H * b_L
+ vpclmulqdq \$0x11, $a, $b, $t3 # a_H * b_H
+ vpxord $t0, $lo, $lo
+ vpternlogd \$0x96, $t2, $t1, $mi
+ vpxord $t3, $hi, $hi
+___
+}
+
+# Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit
+# reduced products in \hi. See _ghash_mul_step for explanation of reduction.
+sub _ghash_reduce {
+ my ( $lo, $mi, $hi, $gfpoly, $t0 ) = @_;
+ return <<___;
+ vpclmulqdq \$0x01, $lo, $gfpoly, $t0
+ vpshufd \$0x4e, $lo, $lo
+ vpternlogd \$0x96, $t0, $lo, $mi
+ vpclmulqdq \$0x01, $mi, $gfpoly, $t0
+ vpshufd \$0x4e, $mi, $mi
+ vpternlogd \$0x96, $t0, $mi, $hi
+___
+}
+
+$g_init_macro_expansion_count = 0;
+
+# void gcm_init_##suffix(u128 Htable[16], const uint64_t H[2]);
+#
+# Initialize |Htable| with powers of the GHASH subkey |H|.
+#
+# The powers are stored in the order H^NUM_H_POWERS to H^1.
+#
+# This macro supports both VL=32 and VL=64. _set_veclen must have been invoked
+# with the desired length. In the VL=32 case, the function computes twice as
+# many key powers as are actually used by the VL=32 GCM update functions.
+# This is done to keep the key format the same regardless of vector length.
+sub _aes_gcm_init {
+ my $local_label_suffix = "__func" . ++$g_init_macro_expansion_count;
+
+ # Function arguments
+ my ( $HTABLE, $H_PTR ) = @argregs[ 0 .. 1 ];
+
+ # Additional local variables. V0-V2 and %rax are used as temporaries.
+ my $POWERS_PTR = "%r8";
+ my $RNDKEYLAST_PTR = "%r9";
+ my ( $H_CUR, $H_CUR_YMM, $H_CUR_XMM ) = ( "$V3", "%ymm3", "%xmm3" );
+ my ( $H_INC, $H_INC_YMM, $H_INC_XMM ) = ( "$V4", "%ymm4", "%xmm4" );
+ my ( $GFPOLY, $GFPOLY_YMM, $GFPOLY_XMM ) = ( "$V5", "%ymm5", "%xmm5" );
+
+ my $code = <<___;
+ # Get pointer to lowest set of key powers (located at end of array).
+ lea $OFFSETOFEND_H_POWERS-$VL($HTABLE), $POWERS_PTR
+
+ # Load the byte-reflected hash subkey. BoringSSL provides it in
+ # byte-reflected form except the two halves are in the wrong order.
+ vpshufd \$0x4e, ($H_PTR), $H_CUR_XMM
+
+ # Finish preprocessing the first key power, H^1. Since this GHASH
+ # implementation operates directly on values with the backwards bit
+ # order specified by the GCM standard, it's necessary to preprocess the
+ # raw key as follows. First, reflect its bytes. Second, multiply it
+ # by x^-1 mod x^128 + x^7 + x^2 + x + 1 (if using the backwards
+ # interpretation of polynomial coefficients), which can also be
+ # interpreted as multiplication by x mod x^128 + x^127 + x^126 + x^121
+ # + 1 using the alternative, natural interpretation of polynomial
+ # coefficients. For details, see the comment above _ghash_mul_step.
+ #
+ # Either way, for the multiplication the concrete operation performed
+ # is a left shift of the 128-bit value by 1 bit, then an XOR with (0xc2
+ # << 120) | 1 if a 1 bit was carried out. However, there's no 128-bit
+ # wide shift instruction, so instead double each of the two 64-bit
+ # halves and incorporate the internal carry bit into the value XOR'd.
+ vpshufd \$0xd3, $H_CUR_XMM, %xmm0
+ vpsrad \$31, %xmm0, %xmm0
+ vpaddq $H_CUR_XMM, $H_CUR_XMM, $H_CUR_XMM
+ # H_CUR_XMM ^= xmm0 & gfpoly_and_internal_carrybit
+ vpternlogd \$0x78, .Lgfpoly_and_internal_carrybit(%rip), %xmm0, $H_CUR_XMM
+
+ # Load the gfpoly constant.
+ vbroadcasti32x4 .Lgfpoly(%rip), $GFPOLY
+
+ # Square H^1 to get H^2.
+ #
+ # Note that as with H^1, all higher key powers also need an extra
+ # factor of x^-1 (or x using the natural interpretation). Nothing
+ # special needs to be done to make this happen, though: H^1 * H^1 would
+ # end up with two factors of x^-1, but the multiplication consumes one.
+ # So the product H^2 ends up with the desired one factor of x^-1.
+ @{[ _ghash_mul $H_CUR_XMM, $H_CUR_XMM, $H_INC_XMM, $GFPOLY_XMM,
+ "%xmm0", "%xmm1", "%xmm2" ]}
+
+ # Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2].
+ vinserti128 \$1, $H_CUR_XMM, $H_INC_YMM, $H_CUR_YMM
+ vinserti128 \$1, $H_INC_XMM, $H_INC_YMM, $H_INC_YMM
+___
+
+ if ( $VL == 64 ) {
+
+ # Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4].
+ $code .= <<___;
+ @{[ _ghash_mul $H_INC_YMM, $H_CUR_YMM, $H_INC_YMM, $GFPOLY_YMM,
+ "%ymm0", "%ymm1", "%ymm2" ]}
+ vinserti64x4 \$1, $H_CUR_YMM, $H_INC, $H_CUR
+ vshufi64x2 \$0, $H_INC, $H_INC, $H_INC
+___
+ }
+
+ $code .= <<___;
+ # Store the lowest set of key powers.
+ vmovdqu8 $H_CUR, ($POWERS_PTR)
+
+ # Compute and store the remaining key powers. With VL=32, repeatedly
+ # multiply [H^(i+1), H^i] by [H^2, H^2] to get [H^(i+3), H^(i+2)].
+ # With VL=64, repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by
+ # [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)].
+ mov \$@{[ $NUM_H_POWERS*16/$VL - 1 ]}, %eax
+.Lprecompute_next$local_label_suffix:
+ sub \$$VL, $POWERS_PTR
+ @{[ _ghash_mul $H_INC, $H_CUR, $H_CUR, $GFPOLY, $V0, $V1, $V2 ]}
+ vmovdqu8 $H_CUR, ($POWERS_PTR)
+ dec %eax
+ jnz .Lprecompute_next$local_label_suffix
+
+ vzeroupper # This is needed after using ymm or zmm registers.
+___
+ return $code;
+}
+
+# XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store
+# the result in \dst_xmm. This implicitly zeroizes the other lanes of dst.
+sub _horizontal_xor {
+ my ( $src, $src_xmm, $dst_xmm, $t0_xmm, $t1_xmm, $t2_xmm ) = @_;
+ if ( $VL == 32 ) {
+ return <<___;
+ vextracti32x4 \$1, $src, $t0_xmm
+ vpxord $t0_xmm, $src_xmm, $dst_xmm
+___
+ }
+ elsif ( $VL == 64 ) {
+ return <<___;
+ vextracti32x4 \$1, $src, $t0_xmm
+ vextracti32x4 \$2, $src, $t1_xmm
+ vextracti32x4 \$3, $src, $t2_xmm
+ vpxord $t0_xmm, $src_xmm, $dst_xmm
+ vpternlogd \$0x96, $t1_xmm, $t2_xmm, $dst_xmm
+___
+ }
+ else {
+ die "Unsupported vector length";
+ }
+}
+
+# Do one step of the GHASH update of the data blocks given in the vector
+# registers GHASHDATA[0-3]. \i specifies the step to do, 0 through 9. The
+# division into steps allows users of this macro to optionally interleave the
+# computation with other instructions. This macro uses the vector register
+# GHASH_ACC as input/output; GHASHDATA[0-3] as inputs that are clobbered;
+# H_POW[4-1], GFPOLY, and BSWAP_MASK as inputs that aren't clobbered; and
+# GHASHTMP[0-2] as temporaries. This macro handles the byte-reflection of the
+# data blocks. The parameter registers must be preserved across steps.
+#
+# The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) +
+# H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the
+# operations are vectorized operations on vectors of 16-byte blocks. E.g.,
+# with VL=32 there are 2 blocks per vector and the vectorized terms correspond
+# to the following non-vectorized terms:
+#
+# H_POW4*(GHASHDATA0 + GHASH_ACC) => H^8*(blk0 + GHASH_ACC_XMM) and H^7*(blk1 + 0)
+# H_POW3*GHASHDATA1 => H^6*blk2 and H^5*blk3
+# H_POW2*GHASHDATA2 => H^4*blk4 and H^3*blk5
+# H_POW1*GHASHDATA3 => H^2*blk6 and H^1*blk7
+#
+# With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15.
+#
+# More concretely, this code does:
+# - Do vectorized "schoolbook" multiplications to compute the intermediate
+# 256-bit product of each block and its corresponding hash key power.
+# There are 4*VL/16 of these intermediate products.
+# - Sum (XOR) the intermediate 256-bit products across vectors. This leaves
+# VL/16 256-bit intermediate values.
+# - Do a vectorized reduction of these 256-bit intermediate values to
+# 128-bits each. This leaves VL/16 128-bit intermediate values.
+# - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM.
+#
+# See _ghash_mul_step for the full explanation of the operations performed for
+# each individual finite field multiplication and reduction.
+sub _ghash_step_4x {
+ my ($i) = @_;
+ if ( $i == 0 ) {
+ return <<___;
+ vpshufb $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0
+ vpxord $GHASH_ACC, $GHASHDATA0, $GHASHDATA0
+ vpshufb $BSWAP_MASK, $GHASHDATA1, $GHASHDATA1
+ vpshufb $BSWAP_MASK, $GHASHDATA2, $GHASHDATA2
+___
+ }
+ elsif ( $i == 1 ) {
+ return <<___;
+ vpshufb $BSWAP_MASK, $GHASHDATA3, $GHASHDATA3
+ vpclmulqdq \$0x00, $H_POW4, $GHASHDATA0, $GHASH_ACC # LO_0
+ vpclmulqdq \$0x00, $H_POW3, $GHASHDATA1, $GHASHTMP0 # LO_1
+ vpclmulqdq \$0x00, $H_POW2, $GHASHDATA2, $GHASHTMP1 # LO_2
+___
+ }
+ elsif ( $i == 2 ) {
+ return <<___;
+ vpxord $GHASHTMP0, $GHASH_ACC, $GHASH_ACC # sum(LO_{1,0})
+ vpclmulqdq \$0x00, $H_POW1, $GHASHDATA3, $GHASHTMP2 # LO_3
+ vpternlogd \$0x96, $GHASHTMP2, $GHASHTMP1, $GHASH_ACC # LO = sum(LO_{3,2,1,0})
+ vpclmulqdq \$0x01, $H_POW4, $GHASHDATA0, $GHASHTMP0 # MI_0
+___
+ }
+ elsif ( $i == 3 ) {
+ return <<___;
+ vpclmulqdq \$0x01, $H_POW3, $GHASHDATA1, $GHASHTMP1 # MI_1
+ vpclmulqdq \$0x01, $H_POW2, $GHASHDATA2, $GHASHTMP2 # MI_2
+ vpternlogd \$0x96, $GHASHTMP2, $GHASHTMP1, $GHASHTMP0 # sum(MI_{2,1,0})
+ vpclmulqdq \$0x01, $H_POW1, $GHASHDATA3, $GHASHTMP1 # MI_3
+___
+ }
+ elsif ( $i == 4 ) {
+ return <<___;
+ vpclmulqdq \$0x10, $H_POW4, $GHASHDATA0, $GHASHTMP2 # MI_4
+ vpternlogd \$0x96, $GHASHTMP2, $GHASHTMP1, $GHASHTMP0 # sum(MI_{4,3,2,1,0})
+ vpclmulqdq \$0x10, $H_POW3, $GHASHDATA1, $GHASHTMP1 # MI_5
+ vpclmulqdq \$0x10, $H_POW2, $GHASHDATA2, $GHASHTMP2 # MI_6
+___
+ }
+ elsif ( $i == 5 ) {
+ return <<___;
+ vpternlogd \$0x96, $GHASHTMP2, $GHASHTMP1, $GHASHTMP0 # sum(MI_{6,5,4,3,2,1,0})
+ vpclmulqdq \$0x01, $GHASH_ACC, $GFPOLY, $GHASHTMP2 # LO_L*(x^63 + x^62 + x^57)
+ vpclmulqdq \$0x10, $H_POW1, $GHASHDATA3, $GHASHTMP1 # MI_7
+ vpxord $GHASHTMP1, $GHASHTMP0, $GHASHTMP0 # MI = sum(MI_{7,6,5,4,3,2,1,0})
+___
+ }
+ elsif ( $i == 6 ) {
+ return <<___;
+ vpshufd \$0x4e, $GHASH_ACC, $GHASH_ACC # Swap halves of LO
+ vpclmulqdq \$0x11, $H_POW4, $GHASHDATA0, $GHASHDATA0 # HI_0
+ vpclmulqdq \$0x11, $H_POW3, $GHASHDATA1, $GHASHDATA1 # HI_1
+ vpclmulqdq \$0x11, $H_POW2, $GHASHDATA2, $GHASHDATA2 # HI_2
+___
+ }
+ elsif ( $i == 7 ) {
+ return <<___;
+ vpternlogd \$0x96, $GHASHTMP2, $GHASH_ACC, $GHASHTMP0 # Fold LO into MI
+ vpclmulqdq \$0x11, $H_POW1, $GHASHDATA3, $GHASHDATA3 # HI_3
+ vpternlogd \$0x96, $GHASHDATA2, $GHASHDATA1, $GHASHDATA0 # sum(HI_{2,1,0})
+ vpclmulqdq \$0x01, $GHASHTMP0, $GFPOLY, $GHASHTMP1 # MI_L*(x^63 + x^62 + x^57)
+___
+ }
+ elsif ( $i == 8 ) {
+ return <<___;
+ vpxord $GHASHDATA3, $GHASHDATA0, $GHASH_ACC # HI = sum(HI_{3,2,1,0})
+ vpshufd \$0x4e, $GHASHTMP0, $GHASHTMP0 # Swap halves of MI
+ vpternlogd \$0x96, $GHASHTMP1, $GHASHTMP0, $GHASH_ACC # Fold MI into HI
+___
+ }
+ elsif ( $i == 9 ) {
+ return _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM,
+ $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM;
+ }
+}
+
+# Update GHASH with the blocks given in GHASHDATA[0-3].
+# See _ghash_step_4x for full explanation.
+sub _ghash_4x {
+ my $code = "";
+ for my $i ( 0 .. 9 ) {
+ $code .= _ghash_step_4x $i;
+ }
+ return $code;
+}
+
+$g_ghash_macro_expansion_count = 0;
+
+# void gcm_ghash_##suffix(uint8_t Xi[16], const u128 Htable[16],
+# const uint8_t *in, size_t len);
+#
+# This macro generates the body of a GHASH update function with the above
+# prototype. This macro supports both VL=32 and VL=64. _set_veclen must have
+# been invoked with the desired length.
+#
+# The generated function processes the AAD (Additional Authenticated Data) in
+# GCM. Using the key |Htable|, it updates the GHASH accumulator |Xi| with the
+# data given by |in| and |len|. On the first call, |Xi| must be all zeroes.
+# |len| must be a multiple of 16.
+#
+# This function handles large amounts of AAD efficiently, while also keeping the
+# overhead low for small amounts of AAD which is the common case. TLS uses less
+# than one block of AAD, but (uncommonly) other use cases may use much more.
+sub _ghash_update {
+ my $local_label_suffix = "__func" . ++$g_ghash_macro_expansion_count;
+ my $code = "";
+
+ # Function arguments
+ my ( $GHASH_ACC_PTR, $H_POWERS, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ];
+
+ # Additional local variables
+ ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( $V0, "%xmm0" );
+ ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( $V1, "%xmm1" );
+ ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( $V2, "%xmm2" );
+ ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( $V3, "%xmm3" );
+ ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( $V4, "%xmm4" );
+ ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( $V5, "%xmm5" );
+ ( $H_POW4, $H_POW3, $H_POW2 ) = ( $V6, $V7, $V8 );
+ ( $H_POW1, $H_POW1_XMM ) = ( $V9, "%xmm9" );
+ ( $GFPOLY, $GFPOLY_XMM ) = ( $V10, "%xmm10" );
+ ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) = ( $V11, $V12, $V13 );
+
+ $code .= <<___;
+ @{[ _save_xmmregs (6 .. 13) ]}
+ .seh_endprologue
+
+ # Load the bswap_mask and gfpoly constants. Since AADLEN is usually small,
+    # only 128-bit vectors will typically be used. So as an optimization, don't
+ # broadcast these constants to all 128-bit lanes quite yet.
+ vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK_XMM
+ vmovdqu .Lgfpoly(%rip), $GFPOLY_XMM
+
+ # Load the GHASH accumulator.
+ vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM
+ vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+
+ # Optimize for AADLEN < VL by checking for AADLEN < VL before AADLEN < 4*VL.
+ cmp \$$VL, $AADLEN
+ jb .Laad_blockbyblock$local_label_suffix
+
+ # AADLEN >= VL, so we'll operate on full vectors. Broadcast bswap_mask and
+ # gfpoly to all 128-bit lanes.
+ vshufi64x2 \$0, $BSWAP_MASK, $BSWAP_MASK, $BSWAP_MASK
+ vshufi64x2 \$0, $GFPOLY, $GFPOLY, $GFPOLY
+
+ # Load the lowest set of key powers.
+ vmovdqu8 $OFFSETOFEND_H_POWERS-1*$VL($H_POWERS), $H_POW1
+
+ cmp \$4*$VL-1, $AADLEN
+ jbe .Laad_loop_1x$local_label_suffix
+
+ # AADLEN >= 4*VL. Load the higher key powers.
+ vmovdqu8 $OFFSETOFEND_H_POWERS-4*$VL($H_POWERS), $H_POW4
+ vmovdqu8 $OFFSETOFEND_H_POWERS-3*$VL($H_POWERS), $H_POW3
+ vmovdqu8 $OFFSETOFEND_H_POWERS-2*$VL($H_POWERS), $H_POW2
+
+ # Update GHASH with 4*VL bytes of AAD at a time.
+.Laad_loop_4x$local_label_suffix:
+ vmovdqu8 0*$VL($AAD), $GHASHDATA0
+ vmovdqu8 1*$VL($AAD), $GHASHDATA1
+ vmovdqu8 2*$VL($AAD), $GHASHDATA2
+ vmovdqu8 3*$VL($AAD), $GHASHDATA3
+ @{[ _ghash_4x ]}
+ sub \$-4*$VL, $AAD # shorter than 'add 4*VL' when VL=32
+ add \$-4*$VL, $AADLEN
+ cmp \$4*$VL-1, $AADLEN
+ ja .Laad_loop_4x$local_label_suffix
+
+ # Update GHASH with VL bytes of AAD at a time.
+ cmp \$$VL, $AADLEN
+ jb .Laad_large_done$local_label_suffix
+.Laad_loop_1x$local_label_suffix:
+ vmovdqu8 ($AAD), $GHASHDATA0
+ vpshufb $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0
+ vpxord $GHASHDATA0, $GHASH_ACC, $GHASH_ACC
+ @{[ _ghash_mul $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY,
+ $GHASHDATA0, $GHASHDATA1, $GHASHDATA2 ]}
+ @{[ _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM,
+ $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
+ add \$$VL, $AAD
+ sub \$$VL, $AADLEN
+ cmp \$$VL, $AADLEN
+ jae .Laad_loop_1x$local_label_suffix
+
+.Laad_large_done$local_label_suffix:
+ # Issue the vzeroupper that is needed after using ymm or zmm registers.
+ # Do it here instead of at the end, to minimize overhead for small AADLEN.
+ vzeroupper
+
+ # GHASH the remaining data 16 bytes at a time, using xmm registers only.
+.Laad_blockbyblock$local_label_suffix:
+ test $AADLEN, $AADLEN
+ jz .Laad_done$local_label_suffix
+ vmovdqu $OFFSETOFEND_H_POWERS-16($H_POWERS), $H_POW1_XMM
+.Laad_loop_blockbyblock$local_label_suffix:
+ vmovdqu ($AAD), $GHASHDATA0_XMM
+ vpshufb $BSWAP_MASK_XMM, $GHASHDATA0_XMM, $GHASHDATA0_XMM
+ vpxor $GHASHDATA0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+ @{[ _ghash_mul $H_POW1_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM,
+ $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
+ add \$16, $AAD
+ sub \$16, $AADLEN
+ jnz .Laad_loop_blockbyblock$local_label_suffix
+
+.Laad_done$local_label_suffix:
+ # Store the updated GHASH accumulator back to memory.
+ vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+ vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
+___
+ return $code;
+}
+
+# Do one non-last round of AES encryption on the counter blocks in V0-V3 using
+# the round key that has been broadcast to all 128-bit lanes of \round_key.
+sub _vaesenc_4x {
+ my ($round_key) = @_;
+ return <<___;
+ vaesenc $round_key, $V0, $V0
+ vaesenc $round_key, $V1, $V1
+ vaesenc $round_key, $V2, $V2
+ vaesenc $round_key, $V3, $V3
+___
+}
+
+# Start the AES encryption of four vectors of counter blocks.
+sub _ctr_begin_4x {
+ return <<___;
+ # Increment LE_CTR four times to generate four vectors of little-endian
+ # counter blocks, swap each to big-endian, and store them in V0-V3.
+ vpshufb $BSWAP_MASK, $LE_CTR, $V0
+ vpaddd $LE_CTR_INC, $LE_CTR, $LE_CTR
+ vpshufb $BSWAP_MASK, $LE_CTR, $V1
+ vpaddd $LE_CTR_INC, $LE_CTR, $LE_CTR
+ vpshufb $BSWAP_MASK, $LE_CTR, $V2
+ vpaddd $LE_CTR_INC, $LE_CTR, $LE_CTR
+ vpshufb $BSWAP_MASK, $LE_CTR, $V3
+ vpaddd $LE_CTR_INC, $LE_CTR, $LE_CTR
+
+ # AES "round zero": XOR in the zero-th round key.
+ vpxord $RNDKEY0, $V0, $V0
+ vpxord $RNDKEY0, $V1, $V1
+ vpxord $RNDKEY0, $V2, $V2
+ vpxord $RNDKEY0, $V3, $V3
+___
+}
+
+# Do the last AES round for four vectors of counter blocks V0-V3, XOR source
+# data with the resulting keystream, and write the result to DST and
+# GHASHDATA[0-3]. (Implementation differs slightly, but has the same effect.)
+sub _aesenclast_and_xor_4x {
+ return <<___;
+ # XOR the source data with the last round key, saving the result in
+ # GHASHDATA[0-3]. This reduces latency by taking advantage of the
+ # property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
+ vpxord 0*$VL($SRC), $RNDKEYLAST, $GHASHDATA0
+ vpxord 1*$VL($SRC), $RNDKEYLAST, $GHASHDATA1
+ vpxord 2*$VL($SRC), $RNDKEYLAST, $GHASHDATA2
+ vpxord 3*$VL($SRC), $RNDKEYLAST, $GHASHDATA3
+
+ # Do the last AES round. This handles the XOR with the source data
+ # too, as per the optimization described above.
+ vaesenclast $GHASHDATA0, $V0, $GHASHDATA0
+ vaesenclast $GHASHDATA1, $V1, $GHASHDATA1
+ vaesenclast $GHASHDATA2, $V2, $GHASHDATA2
+ vaesenclast $GHASHDATA3, $V3, $GHASHDATA3
+
+ # Store the en/decrypted data to DST.
+ vmovdqu8 $GHASHDATA0, 0*$VL($DST)
+ vmovdqu8 $GHASHDATA1, 1*$VL($DST)
+ vmovdqu8 $GHASHDATA2, 2*$VL($DST)
+ vmovdqu8 $GHASHDATA3, 3*$VL($DST)
+___
+}
+
+$g_update_macro_expansion_count = 0;
+
+# void aes_gcm_{enc,dec}_update_##suffix(const uint8_t *in, uint8_t *out,
+# size_t len, const AES_KEY *key,
+# const uint8_t ivec[16],
+# const u128 Htable[16],
+# uint8_t Xi[16]);
+#
+# This macro generates a GCM encryption or decryption update function with the
+# above prototype (with \enc selecting which one). This macro supports both
+# VL=32 and VL=64. _set_veclen must have been invoked with the desired length.
+#
+# This function computes the next portion of the CTR keystream, XOR's it with
+# |len| bytes from |in|, and writes the resulting encrypted or decrypted data
+# to |out|. It also updates the GHASH accumulator |Xi| using the next |len|
+# ciphertext bytes.
+#
+# |len| must be a multiple of 16, except on the last call where it can be any
+# length. The caller must do any buffering needed to ensure this. Both
+# in-place and out-of-place en/decryption are supported.
+#
+# |ivec| must give the current counter in big-endian format. This function
+# loads the counter from |ivec| and increments the loaded counter as needed, but
+# it does *not* store the updated counter back to |ivec|. The caller must
+# update |ivec| if any more data segments follow. Internally, only the low
+# 32-bit word of the counter is incremented, following the GCM standard.
+sub _aes_gcm_update {
+ my $local_label_suffix = "__func" . ++$g_update_macro_expansion_count;
+
+ my ($enc) = @_;
+
+ my $code = "";
+
+ # Function arguments
+ ( $SRC, $DST, $DATALEN, $AESKEY, $BE_CTR_PTR, $H_POWERS, $GHASH_ACC_PTR ) =
+ $win64
+ ? ( @argregs[ 0 .. 3 ], "%rsi", "%rdi", "%r12" )
+ : ( @argregs[ 0 .. 5 ], "%r12" );
+
+ # Additional local variables
+
+ # %rax, %k1, and %k2 are used as temporary registers. BE_CTR_PTR is
+ # also available as a temporary register after the counter is loaded.
+
+ # AES key length in bytes
+ ( $AESKEYLEN, $AESKEYLEN64 ) = ( "%r10d", "%r10" );
+
+ # Pointer to the last AES round key for the chosen AES variant
+ $RNDKEYLAST_PTR = "%r11";
+
+ # In the main loop, V0-V3 are used as AES input and output. Elsewhere
+ # they are used as temporary registers.
+
+ # GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data.
+ ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( $V4, "%xmm4" );
+ ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( $V5, "%xmm5" );
+ ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( $V6, "%xmm6" );
+ ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( $V7, "%xmm7" );
+
+ # BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
+ # using vpshufb, copied to all 128-bit lanes.
+ ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( $V8, "%xmm8" );
+
+ # RNDKEY temporarily holds the next AES round key.
+ $RNDKEY = $V9;
+
+ # GHASH_ACC is the accumulator variable for GHASH. When fully reduced,
+ # only the lowest 128-bit lane can be nonzero. When not fully reduced,
+ # more than one lane may be used, and they need to be XOR'd together.
+ ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( $V10, "%xmm10" );
+
+ # LE_CTR_INC is the vector of 32-bit words that need to be added to a
+ # vector of little-endian counter blocks to advance it forwards.
+ $LE_CTR_INC = $V11;
+
+ # LE_CTR contains the next set of little-endian counter blocks.
+ $LE_CTR = $V12;
+
+ # RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys,
+ # copied to all 128-bit lanes. RNDKEY0 is the zero-th round key,
+ # RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
+ (
+ $RNDKEY0, $RNDKEYLAST, $RNDKEY_M9, $RNDKEY_M8,
+ $RNDKEY_M7, $RNDKEY_M6, $RNDKEY_M5, $RNDKEY_M4,
+ $RNDKEY_M3, $RNDKEY_M2, $RNDKEY_M1
+ ) = ( $V13, $V14, $V15, $V16, $V17, $V18, $V19, $V20, $V21, $V22, $V23 );
+
+ # GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These
+ # cannot coincide with anything used for AES encryption, since for
+ # performance reasons GHASH and AES encryption are interleaved.
+ ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) = ( $V24, $V25, $V26 );
+
+ # H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1. The
+ # descending numbering reflects the order of the key powers.
+ ( $H_POW4, $H_POW3, $H_POW2, $H_POW1 ) = ( $V27, $V28, $V29, $V30 );
+
+ # GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes.
+ $GFPOLY = $V31;
+
+ if ($win64) {
+ $code .= <<___;
+ @{[ _save_gpregs $BE_CTR_PTR, $H_POWERS, $GHASH_ACC_PTR ]}
+ mov 64(%rsp), $BE_CTR_PTR # arg5
+ mov 72(%rsp), $H_POWERS # arg6
+ mov 80(%rsp), $GHASH_ACC_PTR # arg7
+ @{[ _save_xmmregs (6 .. 15) ]}
+ .seh_endprologue
+___
+ }
+ else {
+ $code .= <<___;
+ @{[ _save_gpregs $GHASH_ACC_PTR ]}
+ mov 16(%rsp), $GHASH_ACC_PTR # arg7
+___
+ }
+
+ if ($enc) {
+ $code .= <<___;
+#ifdef BORINGSSL_DISPATCH_TEST
+ .extern BORINGSSL_function_hit
+ movb \$1,BORINGSSL_function_hit+@{[ $VL < 64 ? 6 : 7 ]}(%rip)
+#endif
+___
+ }
+ $code .= <<___;
+ # Load some constants.
+ vbroadcasti32x4 .Lbswap_mask(%rip), $BSWAP_MASK
+ vbroadcasti32x4 .Lgfpoly(%rip), $GFPOLY
+
+ # Load the GHASH accumulator and the starting counter.
+ # BoringSSL passes these values in big endian format.
+ vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM
+ vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+ vbroadcasti32x4 ($BE_CTR_PTR), $LE_CTR
+ vpshufb $BSWAP_MASK, $LE_CTR, $LE_CTR
+
+ # Load the AES key length in bytes. BoringSSL stores number of rounds
+ # minus 1, so convert using: AESKEYLEN = 4 * aeskey->rounds - 20.
+ movl $OFFSETOF_AES_ROUNDS($AESKEY), $AESKEYLEN
+ lea -20(,$AESKEYLEN,4), $AESKEYLEN
+
+ # Make RNDKEYLAST_PTR point to the last AES round key. This is the
+ # round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256
+ # respectively. Then load the zero-th and last round keys.
+ lea 6*16($AESKEY,$AESKEYLEN64,4), $RNDKEYLAST_PTR
+ vbroadcasti32x4 ($AESKEY), $RNDKEY0
+ vbroadcasti32x4 ($RNDKEYLAST_PTR), $RNDKEYLAST
+
+ # Finish initializing LE_CTR by adding [0, 1, ...] to its low words.
+ vpaddd .Lctr_pattern(%rip), $LE_CTR, $LE_CTR
+
+ # Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes.
+ vbroadcasti32x4 .Linc_@{[ $VL / 16 ]}blocks(%rip), $LE_CTR_INC
+
+ # If there are at least 4*VL bytes of data, then continue into the loop
+ # that processes 4*VL bytes of data at a time. Otherwise skip it.
+ cmp \$4*$VL-1, $DATALEN
+ jbe .Lcrypt_loop_4x_done$local_label_suffix
+
+ # Load powers of the hash key.
+ vmovdqu8 $OFFSETOFEND_H_POWERS-4*$VL($H_POWERS), $H_POW4
+ vmovdqu8 $OFFSETOFEND_H_POWERS-3*$VL($H_POWERS), $H_POW3
+ vmovdqu8 $OFFSETOFEND_H_POWERS-2*$VL($H_POWERS), $H_POW2
+ vmovdqu8 $OFFSETOFEND_H_POWERS-1*$VL($H_POWERS), $H_POW1
+___
+
+ # Main loop: en/decrypt and hash 4 vectors at a time.
+ #
+ # When possible, interleave the AES encryption of the counter blocks
+ # with the GHASH update of the ciphertext blocks. This improves
+ # performance on many CPUs because the execution ports used by the VAES
+ # instructions often differ from those used by vpclmulqdq and other
+ # instructions used in GHASH. For example, many Intel CPUs dispatch
+ # vaesenc to ports 0 and 1 and vpclmulqdq to port 5.
+ #
+ # The interleaving is easiest to do during decryption, since during
+ # decryption the ciphertext blocks are immediately available. For
+ # encryption, instead encrypt the first set of blocks, then hash those
+ # blocks while encrypting the next set of blocks, repeat that as
+ # needed, and finally hash the last set of blocks.
+
+ if ($enc) {
+ $code .= <<___;
+ # Encrypt the first 4 vectors of plaintext blocks. Leave the resulting
+ # ciphertext in GHASHDATA[0-3] for GHASH.
+ @{[ _ctr_begin_4x ]}
+ lea 16($AESKEY), %rax
+.Lvaesenc_loop_first_4_vecs$local_label_suffix:
+ vbroadcasti32x4 (%rax), $RNDKEY
+ @{[ _vaesenc_4x $RNDKEY ]}
+ add \$16, %rax
+ cmp %rax, $RNDKEYLAST_PTR
+ jne .Lvaesenc_loop_first_4_vecs$local_label_suffix
+ @{[ _aesenclast_and_xor_4x ]}
+ sub \$-4*$VL, $SRC # shorter than 'add 4*VL' when VL=32
+ sub \$-4*$VL, $DST
+ add \$-4*$VL, $DATALEN
+ cmp \$4*$VL-1, $DATALEN
+ jbe .Lghash_last_ciphertext_4x$local_label_suffix
+___
+ }
+
+ # Cache as many additional AES round keys as possible.
+ for my $i ( reverse 1 .. 9 ) {
+ $code .= <<___;
+ vbroadcasti32x4 -$i*16($RNDKEYLAST_PTR), ${"RNDKEY_M$i"}
+___
+ }
+
+ $code .= <<___;
+.Lcrypt_loop_4x$local_label_suffix:
+___
+
+ # If decrypting, load more ciphertext blocks into GHASHDATA[0-3]. If
+ # encrypting, GHASHDATA[0-3] already contain the previous ciphertext.
+ if ( !$enc ) {
+ $code .= <<___;
+ vmovdqu8 0*$VL($SRC), $GHASHDATA0
+ vmovdqu8 1*$VL($SRC), $GHASHDATA1
+ vmovdqu8 2*$VL($SRC), $GHASHDATA2
+ vmovdqu8 3*$VL($SRC), $GHASHDATA3
+___
+ }
+
+ $code .= <<___;
+ # Start the AES encryption of the counter blocks.
+ @{[ _ctr_begin_4x ]}
+ cmp \$24, $AESKEYLEN
+ jl .Laes128$local_label_suffix
+ je .Laes192$local_label_suffix
+ # AES-256
+ vbroadcasti32x4 -13*16($RNDKEYLAST_PTR), $RNDKEY
+ @{[ _vaesenc_4x $RNDKEY ]}
+ vbroadcasti32x4 -12*16($RNDKEYLAST_PTR), $RNDKEY
+ @{[ _vaesenc_4x $RNDKEY ]}
+.Laes192$local_label_suffix:
+ vbroadcasti32x4 -11*16($RNDKEYLAST_PTR), $RNDKEY
+ @{[ _vaesenc_4x $RNDKEY ]}
+ vbroadcasti32x4 -10*16($RNDKEYLAST_PTR), $RNDKEY
+ @{[ _vaesenc_4x $RNDKEY ]}
+.Laes128$local_label_suffix:
+___
+
+ # Finish the AES encryption of the counter blocks in V0-V3, interleaved
+ # with the GHASH update of the ciphertext blocks in GHASHDATA[0-3].
+ for my $i ( reverse 1 .. 9 ) {
+ $code .= <<___;
+ @{[ _ghash_step_4x (9 - $i) ]}
+ @{[ _vaesenc_4x ${"RNDKEY_M$i"} ]}
+___
+ }
+ $code .= <<___;
+ @{[ _ghash_step_4x 9 ]}
+ @{[ _aesenclast_and_xor_4x ]}
+ sub \$-4*$VL, $SRC # shorter than 'add 4*VL' when VL=32
+ sub \$-4*$VL, $DST
+ add \$-4*$VL, $DATALEN
+ cmp \$4*$VL-1, $DATALEN
+ ja .Lcrypt_loop_4x$local_label_suffix
+___
+
+ if ($enc) {
+
+ # Update GHASH with the last set of ciphertext blocks.
+ $code .= <<___;
+.Lghash_last_ciphertext_4x$local_label_suffix:
+ @{[ _ghash_4x ]}
+___
+ }
+
+ my $POWERS_PTR = $BE_CTR_PTR; # BE_CTR_PTR is free to be reused.
+
+ $code .= <<___;
+.Lcrypt_loop_4x_done$local_label_suffix:
+ # Check whether any data remains.
+ test $DATALEN, $DATALEN
+ jz .Ldone$local_label_suffix
+
+ # The data length isn't a multiple of 4*VL. Process the remaining data
+ # of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time.
+ # Going one vector at a time may seem inefficient compared to having
+ # separate code paths for each possible number of vectors remaining.
+ # However, using a loop keeps the code size down, and it performs
+    # surprisingly well; modern CPUs will start executing the next iteration
+ # before the previous one finishes and also predict the number of loop
+ # iterations. For a similar reason, we roll up the AES rounds.
+ #
+ # On the last iteration, the remaining length may be less than VL.
+ # Handle this using masking.
+ #
+ # Since there are enough key powers available for all remaining data,
+ # there is no need to do a GHASH reduction after each iteration.
+ # Instead, multiply each remaining block by its own key power, and only
+ # do a GHASH reduction at the very end.
+
+ # Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N
+ # is the number of blocks that remain.
+ mov $DATALEN, %rax
+ neg %rax
+ and \$-16, %rax # -round_up(DATALEN, 16)
+ lea $OFFSETOFEND_H_POWERS($H_POWERS,%rax), $POWERS_PTR
+___
+
+ # Start collecting the unreduced GHASH intermediate value LO, MI, HI.
+ my ( $LO, $LO_XMM ) = ( $GHASHDATA0, $GHASHDATA0_XMM );
+ my ( $MI, $MI_XMM ) = ( $GHASHDATA1, $GHASHDATA1_XMM );
+ my ( $HI, $HI_XMM ) = ( $GHASHDATA2, $GHASHDATA2_XMM );
+ $code .= <<___;
+ vpxor $LO_XMM, $LO_XMM, $LO_XMM
+ vpxor $MI_XMM, $MI_XMM, $MI_XMM
+ vpxor $HI_XMM, $HI_XMM, $HI_XMM
+
+ cmp \$$VL, $DATALEN
+ jb .Lpartial_vec$local_label_suffix
+
+.Lcrypt_loop_1x$local_label_suffix:
+ # Process a full vector of length VL.
+
+ # Encrypt a vector of counter blocks.
+ vpshufb $BSWAP_MASK, $LE_CTR, $V0
+ vpaddd $LE_CTR_INC, $LE_CTR, $LE_CTR
+ vpxord $RNDKEY0, $V0, $V0
+ lea 16($AESKEY), %rax
+.Lvaesenc_loop_tail_full_vec$local_label_suffix:
+ vbroadcasti32x4 (%rax), $RNDKEY
+ vaesenc $RNDKEY, $V0, $V0
+ add \$16, %rax
+ cmp %rax, $RNDKEYLAST_PTR
+ jne .Lvaesenc_loop_tail_full_vec$local_label_suffix
+ vaesenclast $RNDKEYLAST, $V0, $V0
+
+ # XOR the data with the vector of keystream blocks.
+ vmovdqu8 ($SRC), $V1
+ vpxord $V1, $V0, $V0
+ vmovdqu8 $V0, ($DST)
+
+ # Update GHASH with the ciphertext blocks, without reducing.
+ vmovdqu8 ($POWERS_PTR), $H_POW1
+ vpshufb $BSWAP_MASK, @{[ $enc ? $V0 : $V1 ]}, $V0
+ vpxord $GHASH_ACC, $V0, $V0
+ @{[ _ghash_mul_noreduce $H_POW1, $V0, $LO, $MI, $HI, $GHASHDATA3,
+ $V1, $V2, $V3 ]}
+ vpxor $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+
+ add \$$VL, $POWERS_PTR
+ add \$$VL, $SRC
+ add \$$VL, $DST
+ sub \$$VL, $DATALEN
+ cmp \$$VL, $DATALEN
+ jae .Lcrypt_loop_1x$local_label_suffix
+
+ test $DATALEN, $DATALEN
+ jz .Lreduce$local_label_suffix
+
+.Lpartial_vec$local_label_suffix:
+ # Process a partial vector of length 1 <= DATALEN < VL.
+
+ # Set the data mask %k1 to DATALEN 1's.
+ # Set the key powers mask %k2 to round_up(DATALEN, 16) 1's.
+ mov \$-1, %rax
+ bzhi $DATALEN, %rax, %rax
+ @{[ $VL < 64 ? "kmovd %eax, %k1" : "kmovq %rax, %k1" ]}
+ add \$15, $DATALEN
+ and \$-16, $DATALEN
+ mov \$-1, %rax
+ bzhi $DATALEN, %rax, %rax
+ @{[ $VL < 64 ? "kmovd %eax, %k2" : "kmovq %rax, %k2" ]}
+
+ # Encrypt one last vector of counter blocks. This does not need to be
+ # masked. The counter does not need to be incremented here.
+ vpshufb $BSWAP_MASK, $LE_CTR, $V0
+ vpxord $RNDKEY0, $V0, $V0
+ lea 16($AESKEY), %rax
+.Lvaesenc_loop_tail_partialvec$local_label_suffix:
+ vbroadcasti32x4 (%rax), $RNDKEY
+ vaesenc $RNDKEY, $V0, $V0
+ add \$16, %rax
+ cmp %rax, $RNDKEYLAST_PTR
+ jne .Lvaesenc_loop_tail_partialvec$local_label_suffix
+ vaesenclast $RNDKEYLAST, $V0, $V0
+
+ # XOR the data with the appropriate number of keystream bytes.
+ vmovdqu8 ($SRC), $V1\{%k1}{z}
+ vpxord $V1, $V0, $V0
+ vmovdqu8 $V0, ($DST){%k1}
+
+ # Update GHASH with the ciphertext block(s), without reducing.
+ #
+ # In the case of DATALEN < VL, the ciphertext is zero-padded to VL.
+ # (If decrypting, it's done by the above masked load. If encrypting,
+ # it's done by the below masked register-to-register move.) Note that
+ # if DATALEN <= VL - 16, there will be additional padding beyond the
+ # padding of the last block specified by GHASH itself; i.e., there may
+ # be whole block(s) that get processed by the GHASH multiplication and
+ # reduction instructions but should not actually be included in the
+ # GHASH. However, any such blocks are all-zeroes, and the values that
+ # they're multiplied with are also all-zeroes. Therefore they just add
+ # 0 * 0 = 0 to the final GHASH result, which makes no difference.
+ vmovdqu8 ($POWERS_PTR), $H_POW1\{%k2}{z}
+ @{[ $enc ? "vmovdqu8 $V0, $V1\{%k1}{z}" : "" ]}
+ vpshufb $BSWAP_MASK, $V1, $V0
+ vpxord $GHASH_ACC, $V0, $V0
+ @{[ _ghash_mul_noreduce $H_POW1, $V0, $LO, $MI, $HI, $GHASHDATA3,
+ $V1, $V2, $V3 ]}
+
+.Lreduce$local_label_suffix:
+ # Finally, do the GHASH reduction.
+ @{[ _ghash_reduce $LO, $MI, $HI, $GFPOLY, $V0 ]}
+ @{[ _horizontal_xor $HI, $HI_XMM, $GHASH_ACC_XMM,
+ "%xmm0", "%xmm1", "%xmm2" ]}
+
+.Ldone$local_label_suffix:
+ # Store the updated GHASH accumulator back to memory.
+ vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+ vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
+
+ vzeroupper # This is needed after using ymm or zmm registers.
+___
+ return $code;
+}
+
+# void gcm_gmult_vpclmulqdq_avx10(uint8_t Xi[16], const u128 Htable[16]);
+$code .= _begin_func "gcm_gmult_vpclmulqdq_avx10", 1;
+{
+ my ( $GHASH_ACC_PTR, $H_POWERS ) = @argregs[ 0 .. 1 ];
+ my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
+ map( "%xmm$_", ( 0 .. 6 ) );
+
+ $code .= <<___;
+ @{[ _save_xmmregs (6) ]}
+ .seh_endprologue
+
+ vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC
+ vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK
+ vmovdqu $OFFSETOFEND_H_POWERS-16($H_POWERS), $H_POW1
+ vmovdqu .Lgfpoly(%rip), $GFPOLY
+ vpshufb $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
+
+ @{[ _ghash_mul $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $T0, $T1, $T2 ]}
+
+ vpshufb $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
+ vmovdqu $GHASH_ACC, ($GHASH_ACC_PTR)
+___
+}
+$code .= _end_func;
+
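+# The GHASH and AES-GCM update functions below are each emitted twice: once
+# using 256-bit vectors (ymm) and once using 512-bit vectors (zmm).
+# gcm_init_vpclmulqdq_avx10 is emitted only once, since both vector lengths
+# use the same key power table.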
+_set_veclen 32;
+
+$code .= _begin_func "gcm_init_vpclmulqdq_avx10", 0;
+$code .= _aes_gcm_init;
+$code .= _end_func;
+
+$code .= _begin_func "gcm_ghash_vpclmulqdq_avx10_256", 1;
+$code .= _ghash_update;
+$code .= _end_func;
+
+$code .= _begin_func "aes_gcm_enc_update_vaes_avx10_256", 1;
+$code .= _aes_gcm_update 1;
+$code .= _end_func;
+
+$code .= _begin_func "aes_gcm_dec_update_vaes_avx10_256", 1;
+$code .= _aes_gcm_update 0;
+$code .= _end_func;
+
+_set_veclen 64;
+
+$code .= _begin_func "gcm_ghash_vpclmulqdq_avx10_512", 1;
+$code .= _ghash_update;
+$code .= _end_func;
+
+$code .= _begin_func "aes_gcm_enc_update_vaes_avx10_512", 1;
+$code .= _aes_gcm_update 1;
+$code .= _end_func;
+
+$code .= _begin_func "aes_gcm_dec_update_vaes_avx10_512", 1;
+$code .= _aes_gcm_update 0;
+$code .= _end_func;
+
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
+exit 0;
diff --git a/crypto/fipsmodule/modes/gcm.cc.inc b/crypto/fipsmodule/modes/gcm.cc.inc
index 8413951..2d67eea 100644
--- a/crypto/fipsmodule/modes/gcm.cc.inc
+++ b/crypto/fipsmodule/modes/gcm.cc.inc
@@ -135,14 +135,42 @@
#if defined(HW_GCM) && defined(OPENSSL_X86_64)
static size_t hw_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
- uint8_t Xi[16], const u128 Htable[16]) {
- return aesni_gcm_encrypt(in, out, len, key, ivec, Htable, Xi);
+ uint8_t Xi[16], const u128 Htable[16],
+ enum gcm_impl_t impl) {
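+  // In the VAES cases, the assembly processes only whole blocks and does not
+  // modify |ivec|, so |len| is rounded down to a multiple of the block size
+  // and the 32-bit big-endian block counter in |ivec| is advanced here.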
+ switch (impl) {
+ case gcm_x86_vaes_avx10_256:
+ len &= kSizeTWithoutLower4Bits;
+ aes_gcm_enc_update_vaes_avx10_256(in, out, len, key, ivec, Htable, Xi);
+ CRYPTO_store_u32_be(&ivec[12], CRYPTO_load_u32_be(&ivec[12]) + len / 16);
+ return len;
+ case gcm_x86_vaes_avx10_512:
+ len &= kSizeTWithoutLower4Bits;
+ aes_gcm_enc_update_vaes_avx10_512(in, out, len, key, ivec, Htable, Xi);
+ CRYPTO_store_u32_be(&ivec[12], CRYPTO_load_u32_be(&ivec[12]) + len / 16);
+ return len;
+ default:
+ return aesni_gcm_encrypt(in, out, len, key, ivec, Htable, Xi);
+ }
}
static size_t hw_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
- uint8_t Xi[16], const u128 Htable[16]) {
- return aesni_gcm_decrypt(in, out, len, key, ivec, Htable, Xi);
+ uint8_t Xi[16], const u128 Htable[16],
+ enum gcm_impl_t impl) {
+ switch (impl) {
+ case gcm_x86_vaes_avx10_256:
+ len &= kSizeTWithoutLower4Bits;
+ aes_gcm_dec_update_vaes_avx10_256(in, out, len, key, ivec, Htable, Xi);
+ CRYPTO_store_u32_be(&ivec[12], CRYPTO_load_u32_be(&ivec[12]) + len / 16);
+ return len;
+ case gcm_x86_vaes_avx10_512:
+ len &= kSizeTWithoutLower4Bits;
+ aes_gcm_dec_update_vaes_avx10_512(in, out, len, key, ivec, Htable, Xi);
+ CRYPTO_store_u32_be(&ivec[12], CRYPTO_load_u32_be(&ivec[12]) + len / 16);
+ return len;
+ default:
+ return aesni_gcm_decrypt(in, out, len, key, ivec, Htable, Xi);
+ }
}
#endif // HW_GCM && X86_64
@@ -150,7 +178,8 @@
static size_t hw_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
- uint8_t Xi[16], const u128 Htable[16]) {
+ uint8_t Xi[16], const u128 Htable[16],
+ enum gcm_impl_t impl) {
const size_t len_blocks = len & kSizeTWithoutLower4Bits;
if (!len_blocks) {
return 0;
@@ -161,7 +190,8 @@
static size_t hw_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
- uint8_t Xi[16], const u128 Htable[16]) {
+ uint8_t Xi[16], const u128 Htable[16],
+ enum gcm_impl_t impl) {
const size_t len_blocks = len & kSizeTWithoutLower4Bits;
if (!len_blocks) {
return 0;
@@ -173,21 +203,28 @@
#endif // HW_GCM && AARCH64
void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash,
- u128 out_table[16], int *out_is_avx,
- const uint8_t gcm_key[16]) {
- *out_is_avx = 0;
-
+ u128 out_table[16], const uint8_t gcm_key[16]) {
// H is passed to |gcm_init_*| as a pair of byte-swapped, 64-bit values.
uint64_t H[2] = {CRYPTO_load_u64_be(gcm_key),
CRYPTO_load_u64_be(gcm_key + 8)};
#if defined(GHASH_ASM_X86_64)
if (crypto_gcm_clmul_enabled()) {
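+    // Prefer the VPCLMULQDQ implementation when the required extensions are
+    // available. The 256-bit variant is used on CPUs that should avoid zmm
+    // registers; otherwise the 512-bit variant is used.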
+ if (CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() &&
+ CRYPTO_is_VPCLMULQDQ_capable() && CRYPTO_is_BMI2_capable()) {
+ gcm_init_vpclmulqdq_avx10(out_table, H);
+ *out_mult = gcm_gmult_vpclmulqdq_avx10;
+ if (CRYPTO_cpu_avoid_zmm_registers()) {
+ *out_hash = gcm_ghash_vpclmulqdq_avx10_256;
+ } else {
+ *out_hash = gcm_ghash_vpclmulqdq_avx10_512;
+ }
+ return;
+ }
if (CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable()) {
gcm_init_avx(out_table, H);
*out_mult = gcm_gmult_avx;
*out_hash = gcm_ghash_avx;
- *out_is_avx = 1;
return;
}
gcm_init_clmul(out_table, H);
@@ -244,14 +281,25 @@
OPENSSL_memset(ghash_key, 0, sizeof(ghash_key));
(*block)(ghash_key, ghash_key, aes_key);
- int is_avx;
- CRYPTO_ghash_init(&gcm_key->gmult, &gcm_key->ghash, gcm_key->Htable, &is_avx,
+ CRYPTO_ghash_init(&gcm_key->gmult, &gcm_key->ghash, gcm_key->Htable,
ghash_key);
-#if defined(OPENSSL_AARCH64) && !defined(OPENSSL_NO_ASM)
- gcm_key->use_hw_gcm_crypt = (gcm_pmull_capable() && block_is_hwaes) ? 1 : 0;
-#else
- gcm_key->use_hw_gcm_crypt = (is_avx && block_is_hwaes) ? 1 : 0;
+#if !defined(OPENSSL_NO_ASM)
+#if defined(OPENSSL_X86_64)
+ if (gcm_key->ghash == gcm_ghash_vpclmulqdq_avx10_256 &&
+ CRYPTO_is_VAES_capable()) {
+ gcm_key->impl = gcm_x86_vaes_avx10_256;
+ } else if (gcm_key->ghash == gcm_ghash_vpclmulqdq_avx10_512 &&
+ CRYPTO_is_VAES_capable()) {
+ gcm_key->impl = gcm_x86_vaes_avx10_512;
+ } else if (gcm_key->ghash == gcm_ghash_avx && block_is_hwaes) {
+ gcm_key->impl = gcm_x86_aesni;
+ }
+#elif defined(OPENSSL_AARCH64)
+ if (gcm_pmull_capable() && block_is_hwaes) {
+ gcm_key->impl = gcm_arm64_aes;
+ }
+#endif
#endif
}
@@ -565,11 +613,11 @@
#if defined(HW_GCM)
// Check |len| to work around a C language bug. See https://crbug.com/1019588.
- if (ctx->gcm_key.use_hw_gcm_crypt && len > 0) {
+ if (ctx->gcm_key.impl != gcm_separate && len > 0) {
// |hw_gcm_encrypt| may not process all the input given to it. It may
// not process *any* of its input if it is deemed too small.
size_t bulk = hw_gcm_encrypt(in, out, len, key, ctx->Yi, ctx->Xi,
- ctx->gcm_key.Htable);
+ ctx->gcm_key.Htable, ctx->gcm_key.impl);
in += bulk;
out += bulk;
len -= bulk;
@@ -654,11 +702,11 @@
#if defined(HW_GCM)
// Check |len| to work around a C language bug. See https://crbug.com/1019588.
- if (ctx->gcm_key.use_hw_gcm_crypt && len > 0) {
+ if (ctx->gcm_key.impl != gcm_separate && len > 0) {
// |hw_gcm_decrypt| may not process all the input given to it. It may
// not process *any* of its input if it is deemed too small.
size_t bulk = hw_gcm_decrypt(in, out, len, key, ctx->Yi, ctx->Xi,
- ctx->gcm_key.Htable);
+ ctx->gcm_key.Htable, ctx->gcm_key.impl);
in += bulk;
out += bulk;
len -= bulk;
diff --git a/crypto/fipsmodule/modes/gcm_test.cc b/crypto/fipsmodule/modes/gcm_test.cc
index 1729e0d..53415f4 100644
--- a/crypto/fipsmodule/modes/gcm_test.cc
+++ b/crypto/fipsmodule/modes/gcm_test.cc
@@ -81,6 +81,45 @@
}
}
}
+ if (CRYPTO_is_VAES_capable() && CRYPTO_is_VPCLMULQDQ_capable() &&
+ CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() &&
+ CRYPTO_is_BMI2_capable()) {
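+    // Test the VPCLMULQDQ/VAES functions for both vector lengths, with both
+    // whole-block lengths and lengths that end in a partial block.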
+ AES_KEY aes_key;
+ static const uint8_t kKey[16] = {0};
+ uint8_t iv[16] = {0};
+
+ CHECK_ABI_SEH(gcm_init_vpclmulqdq_avx10, Htable, kH);
+ CHECK_ABI_SEH(gcm_gmult_vpclmulqdq_avx10, X, Htable);
+ for (size_t blocks : kBlockCounts) {
+ CHECK_ABI_SEH(gcm_ghash_vpclmulqdq_avx10_256, X, Htable, buf,
+ 16 * blocks);
+ CHECK_ABI_SEH(gcm_ghash_vpclmulqdq_avx10_512, X, Htable, buf,
+ 16 * blocks);
+ }
+
+ aes_hw_set_encrypt_key(kKey, 128, &aes_key);
+ for (size_t blocks : kBlockCounts) {
+ CHECK_ABI_SEH(aes_gcm_enc_update_vaes_avx10_256, buf, buf, blocks * 16,
+ &aes_key, iv, Htable, X);
+ CHECK_ABI_SEH(aes_gcm_enc_update_vaes_avx10_256, buf, buf,
+ blocks * 16 + 7, &aes_key, iv, Htable, X);
+ CHECK_ABI_SEH(aes_gcm_enc_update_vaes_avx10_512, buf, buf, blocks * 16,
+ &aes_key, iv, Htable, X);
+ CHECK_ABI_SEH(aes_gcm_enc_update_vaes_avx10_512, buf, buf,
+ blocks * 16 + 7, &aes_key, iv, Htable, X);
+ }
+ aes_hw_set_decrypt_key(kKey, 128, &aes_key);
+ for (size_t blocks : kBlockCounts) {
+ CHECK_ABI_SEH(aes_gcm_dec_update_vaes_avx10_256, buf, buf, blocks * 16,
+ &aes_key, iv, Htable, X);
+ CHECK_ABI_SEH(aes_gcm_dec_update_vaes_avx10_256, buf, buf,
+ blocks * 16 + 7, &aes_key, iv, Htable, X);
+ CHECK_ABI_SEH(aes_gcm_dec_update_vaes_avx10_512, buf, buf, blocks * 16,
+ &aes_key, iv, Htable, X);
+ CHECK_ABI_SEH(aes_gcm_dec_update_vaes_avx10_512, buf, buf,
+ blocks * 16 + 7, &aes_key, iv, Htable, X);
+ }
+ }
#endif // GHASH_ASM_X86_64
}
#endif // GHASH_ASM_X86 || GHASH_ASM_X86_64
diff --git a/crypto/fipsmodule/modes/internal.h b/crypto/fipsmodule/modes/internal.h
index 601dab7..4cedc39 100644
--- a/crypto/fipsmodule/modes/internal.h
+++ b/crypto/fipsmodule/modes/internal.h
@@ -126,6 +126,15 @@
// can be safely copied. Additionally, |gcm_key| is split into a separate
// struct.
+// gcm_impl_t specifies an assembly implementation of AES-GCM.
+enum gcm_impl_t {
+ gcm_separate = 0, // No combined AES-GCM, but may have AES-CTR and GHASH.
+ gcm_x86_aesni,
+ gcm_x86_vaes_avx10_256,
+ gcm_x86_vaes_avx10_512,
+ gcm_arm64_aes,
+};
+
typedef struct { uint64_t hi,lo; } u128;
// gmult_func multiplies |Xi| by the GCM key and writes the result back to
@@ -148,10 +157,7 @@
ghash_func ghash;
block128_f block;
-
- // use_hw_gcm_crypt is true if this context should use platform-specific
- // assembly to process GCM data.
- unsigned use_hw_gcm_crypt:1;
+ enum gcm_impl_t impl;
} GCM128_KEY;
// GCM128_CONTEXT contains state for a single GCM operation. The structure
@@ -182,11 +188,9 @@
// CRYPTO_ghash_init writes a precomputed table of powers of |gcm_key| to
// |out_table| and sets |*out_mult| and |*out_hash| to (potentially hardware
-// accelerated) functions for performing operations in the GHASH field. If the
-// AVX implementation was used |*out_is_avx| will be true.
+// accelerated) functions for performing operations in the GHASH field.
void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash,
- u128 out_table[16], int *out_is_avx,
- const uint8_t gcm_key[16]);
+ u128 out_table[16], const uint8_t gcm_key[16]);
// CRYPTO_gcm128_init_key initialises |gcm_key| to use |block| (typically AES)
// with the given key. |block_is_hwaes| is one if |block| is |aes_hw_encrypt|.
@@ -279,6 +283,30 @@
size_t aesni_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
const u128 Htable[16], uint8_t Xi[16]);
+
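+// VPCLMULQDQ-based (and, for the aes_gcm_* functions, VAES-based)
+// implementations. The _256 and _512 suffixes indicate the maximum vector
+// length used: 256-bit (ymm) or 512-bit (zmm) registers.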
+void gcm_init_vpclmulqdq_avx10(u128 Htable[16], const uint64_t H[2]);
+void gcm_gmult_vpclmulqdq_avx10(uint8_t Xi[16], const u128 Htable[16]);
+void gcm_ghash_vpclmulqdq_avx10_256(uint8_t Xi[16], const u128 Htable[16],
+ const uint8_t *in, size_t len);
+void gcm_ghash_vpclmulqdq_avx10_512(uint8_t Xi[16], const u128 Htable[16],
+ const uint8_t *in, size_t len);
+void aes_gcm_enc_update_vaes_avx10_256(const uint8_t *in, uint8_t *out,
+ size_t len, const AES_KEY *key,
+ const uint8_t ivec[16],
+ const u128 Htable[16], uint8_t Xi[16]);
+void aes_gcm_dec_update_vaes_avx10_256(const uint8_t *in, uint8_t *out,
+ size_t len, const AES_KEY *key,
+ const uint8_t ivec[16],
+ const u128 Htable[16], uint8_t Xi[16]);
+void aes_gcm_enc_update_vaes_avx10_512(const uint8_t *in, uint8_t *out,
+ size_t len, const AES_KEY *key,
+ const uint8_t ivec[16],
+ const u128 Htable[16], uint8_t Xi[16]);
+void aes_gcm_dec_update_vaes_avx10_512(const uint8_t *in, uint8_t *out,
+ size_t len, const AES_KEY *key,
+ const uint8_t ivec[16],
+ const u128 Htable[16], uint8_t Xi[16]);
+
#endif // OPENSSL_X86_64
#if defined(OPENSSL_X86)
diff --git a/crypto/fipsmodule/modes/polyval.cc.inc b/crypto/fipsmodule/modes/polyval.cc.inc
index 4e53222..5e9e664 100644
--- a/crypto/fipsmodule/modes/polyval.cc.inc
+++ b/crypto/fipsmodule/modes/polyval.cc.inc
@@ -56,8 +56,7 @@
OPENSSL_memcpy(H, key, 16);
reverse_and_mulX_ghash(H);
- int is_avx;
- CRYPTO_ghash_init(&ctx->gmult, &ctx->ghash, ctx->Htable, &is_avx, H);
+ CRYPTO_ghash_init(&ctx->gmult, &ctx->ghash, ctx->Htable, H);
OPENSSL_memset(&ctx->S, 0, sizeof(ctx->S));
}
diff --git a/crypto/impl_dispatch_test.cc b/crypto/impl_dispatch_test.cc
index 631e78f..3eb6cfa 100644
--- a/crypto/impl_dispatch_test.cc
+++ b/crypto/impl_dispatch_test.cc
@@ -36,6 +36,10 @@
aesni_ = CRYPTO_is_AESNI_capable();
avx_movbe_ = CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable();
ssse3_ = CRYPTO_is_SSSE3_capable();
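+    // vaes_ mirrors the conditions required for the VAES AES-GCM assembly;
+    // avoid_zmm_ selects between its 256-bit and 512-bit variants.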
+ vaes_ = CRYPTO_is_VAES_capable() && CRYPTO_is_VPCLMULQDQ_capable() &&
+ CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() &&
+ CRYPTO_is_BMI2_capable();
+ avoid_zmm_ = CRYPTO_cpu_avoid_zmm_registers();
is_x86_64_ =
#if defined(OPENSSL_X86_64)
true;
@@ -75,6 +79,8 @@
bool avx_movbe_ = false;
bool ssse3_ = false;
bool is_x86_64_ = false;
+ bool vaes_ = false;
+ bool avoid_zmm_ = false;
#endif
};
@@ -87,16 +93,23 @@
constexpr size_t kFlag_aes_hw_set_encrypt_key = 3;
constexpr size_t kFlag_vpaes_encrypt = 4;
constexpr size_t kFlag_vpaes_set_encrypt_key = 5;
+constexpr size_t kFlag_aes_gcm_enc_update_vaes_avx10_256 = 6;
+constexpr size_t kFlag_aes_gcm_enc_update_vaes_avx10_512 = 7;
TEST_F(ImplDispatchTest, AEAD_AES_GCM) {
AssertFunctionsHit(
{
- {kFlag_aes_hw_ctr32_encrypt_blocks, aesni_},
+ {kFlag_aes_hw_ctr32_encrypt_blocks, aesni_ && !(is_x86_64_ && vaes_)},
{kFlag_aes_hw_encrypt, aesni_},
{kFlag_aes_hw_set_encrypt_key, aesni_},
- {kFlag_aesni_gcm_encrypt, is_x86_64_ && aesni_ && avx_movbe_},
+ {kFlag_aesni_gcm_encrypt,
+ is_x86_64_ && aesni_ && avx_movbe_ && !vaes_},
{kFlag_vpaes_encrypt, ssse3_ && !aesni_},
{kFlag_vpaes_set_encrypt_key, ssse3_ && !aesni_},
+ {kFlag_aes_gcm_enc_update_vaes_avx10_256,
+ is_x86_64_ && vaes_ && avoid_zmm_},
+ {kFlag_aes_gcm_enc_update_vaes_avx10_512,
+ is_x86_64_ && vaes_ && !avoid_zmm_},
},
[] {
const uint8_t kZeros[16] = {0};
diff --git a/crypto/internal.h b/crypto/internal.h
index 8c1900b..46decef 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -1537,7 +1537,9 @@
// 3: aes_hw_set_encrypt_key
// 4: vpaes_encrypt
// 5: vpaes_set_encrypt_key
-extern uint8_t BORINGSSL_function_hit[7];
+// 6: aes_gcm_enc_update_vaes_avx10_256
+// 7: aes_gcm_enc_update_vaes_avx10_512
+extern uint8_t BORINGSSL_function_hit[8];
#endif // BORINGSSL_DISPATCH_TEST
// OPENSSL_vasprintf_internal is just like |vasprintf(3)|. If |system_malloc| is
diff --git a/gen/bcm/aes-gcm-avx10-x86_64-apple.S b/gen/bcm/aes-gcm-avx10-x86_64-apple.S
new file mode 100644
index 0000000..b75bb07
--- /dev/null
+++ b/gen/bcm/aes-gcm-avx10-x86_64-apple.S
@@ -0,0 +1,2264 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.section __DATA,__const
+.p2align 6
+
+
+L$bswap_mask:
+.quad 0x08090a0b0c0d0e0f, 0x0001020304050607
+
+
+
+
+
+
+
+
+L$gfpoly:
+.quad 1, 0xc200000000000000
+
+
+L$gfpoly_and_internal_carrybit:
+.quad 1, 0xc200000000000001
+
+
+
+
+
+L$ctr_pattern:
+.quad 0, 0
+.quad 1, 0
+L$inc_2blocks:
+.quad 2, 0
+.quad 3, 0
+L$inc_4blocks:
+.quad 4, 0
+
+.text
+.globl _gcm_gmult_vpclmulqdq_avx10
+.private_extern _gcm_gmult_vpclmulqdq_avx10
+
+.p2align 5
+_gcm_gmult_vpclmulqdq_avx10:
+
+
+_CET_ENDBR
+
+
+
+ vmovdqu (%rdi),%xmm0
+ vmovdqu L$bswap_mask(%rip),%xmm1
+ vmovdqu 256-16(%rsi),%xmm2
+ vmovdqu L$gfpoly(%rip),%xmm3
+ vpshufb %xmm1,%xmm0,%xmm0
+
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm4
+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm5
+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm6
+ vpxord %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x01,%xmm4,%xmm3,%xmm6
+ vpshufd $0x4e,%xmm4,%xmm4
+ vpternlogd $0x96,%xmm6,%xmm4,%xmm5
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x01,%xmm5,%xmm3,%xmm4
+ vpshufd $0x4e,%xmm5,%xmm5
+ vpternlogd $0x96,%xmm4,%xmm5,%xmm0
+
+
+ vpshufb %xmm1,%xmm0,%xmm0
+ vmovdqu %xmm0,(%rdi)
+ ret
+
+
+
+.globl _gcm_init_vpclmulqdq_avx10
+.private_extern _gcm_init_vpclmulqdq_avx10
+
+.p2align 5
+_gcm_init_vpclmulqdq_avx10:
+
+
+_CET_ENDBR
+
+ leaq 256-32(%rdi),%r8
+
+
+
+ vpshufd $0x4e,(%rsi),%xmm3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ vpshufd $0xd3,%xmm3,%xmm0
+ vpsrad $31,%xmm0,%xmm0
+ vpaddq %xmm3,%xmm3,%xmm3
+
+ vpternlogd $0x78,L$gfpoly_and_internal_carrybit(%rip),%xmm0,%xmm3
+
+
+ vbroadcasti32x4 L$gfpoly(%rip),%ymm5
+
+
+
+
+
+
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm3,%xmm0
+ vpclmulqdq $0x01,%xmm3,%xmm3,%xmm1
+ vpclmulqdq $0x10,%xmm3,%xmm3,%xmm2
+ vpxord %xmm2,%xmm1,%xmm1
+ vpclmulqdq $0x01,%xmm0,%xmm5,%xmm2
+ vpshufd $0x4e,%xmm0,%xmm0
+ vpternlogd $0x96,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x11,%xmm3,%xmm3,%xmm4
+ vpclmulqdq $0x01,%xmm1,%xmm5,%xmm0
+ vpshufd $0x4e,%xmm1,%xmm1
+ vpternlogd $0x96,%xmm0,%xmm1,%xmm4
+
+
+
+ vinserti128 $1,%xmm3,%ymm4,%ymm3
+ vinserti128 $1,%xmm4,%ymm4,%ymm4
+
+ vmovdqu8 %ymm3,(%r8)
+
+
+
+
+
+ movl $7,%eax
+L$precompute_next__func1:
+ subq $32,%r8
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm0
+ vpclmulqdq $0x01,%ymm4,%ymm3,%ymm1
+ vpclmulqdq $0x10,%ymm4,%ymm3,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vpclmulqdq $0x01,%ymm0,%ymm5,%ymm2
+ vpshufd $0x4e,%ymm0,%ymm0
+ vpternlogd $0x96,%ymm2,%ymm0,%ymm1
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm3
+ vpclmulqdq $0x01,%ymm1,%ymm5,%ymm0
+ vpshufd $0x4e,%ymm1,%ymm1
+ vpternlogd $0x96,%ymm0,%ymm1,%ymm3
+
+ vmovdqu8 %ymm3,(%r8)
+ decl %eax
+ jnz L$precompute_next__func1
+
+ vzeroupper
+ ret
+
+
+
+.globl _gcm_ghash_vpclmulqdq_avx10_256
+.private_extern _gcm_ghash_vpclmulqdq_avx10_256
+
+.p2align 5
+_gcm_ghash_vpclmulqdq_avx10_256:
+
+
+_CET_ENDBR
+
+
+
+
+
+
+ vmovdqu L$bswap_mask(%rip),%xmm4
+ vmovdqu L$gfpoly(%rip),%xmm10
+
+
+ vmovdqu (%rdi),%xmm5
+ vpshufb %xmm4,%xmm5,%xmm5
+
+
+ cmpq $32,%rcx
+ jb L$aad_blockbyblock__func1
+
+
+
+ vshufi64x2 $0,%ymm4,%ymm4,%ymm4
+ vshufi64x2 $0,%ymm10,%ymm10,%ymm10
+
+
+ vmovdqu8 256-32(%rsi),%ymm9
+
+ cmpq $128-1,%rcx
+ jbe L$aad_loop_1x__func1
+
+
+ vmovdqu8 256-128(%rsi),%ymm6
+ vmovdqu8 256-96(%rsi),%ymm7
+ vmovdqu8 256-64(%rsi),%ymm8
+
+
+L$aad_loop_4x__func1:
+ vmovdqu8 0(%rdx),%ymm0
+ vmovdqu8 32(%rdx),%ymm1
+ vmovdqu8 64(%rdx),%ymm2
+ vmovdqu8 96(%rdx),%ymm3
+ vpshufb %ymm4,%ymm0,%ymm0
+ vpxord %ymm5,%ymm0,%ymm0
+ vpshufb %ymm4,%ymm1,%ymm1
+ vpshufb %ymm4,%ymm2,%ymm2
+ vpshufb %ymm4,%ymm3,%ymm3
+ vpclmulqdq $0x00,%ymm6,%ymm0,%ymm5
+ vpclmulqdq $0x00,%ymm7,%ymm1,%ymm11
+ vpclmulqdq $0x00,%ymm8,%ymm2,%ymm12
+ vpxord %ymm11,%ymm5,%ymm5
+ vpclmulqdq $0x00,%ymm9,%ymm3,%ymm13
+ vpternlogd $0x96,%ymm13,%ymm12,%ymm5
+ vpclmulqdq $0x01,%ymm6,%ymm0,%ymm11
+ vpclmulqdq $0x01,%ymm7,%ymm1,%ymm12
+ vpclmulqdq $0x01,%ymm8,%ymm2,%ymm13
+ vpternlogd $0x96,%ymm13,%ymm12,%ymm11
+ vpclmulqdq $0x01,%ymm9,%ymm3,%ymm12
+ vpclmulqdq $0x10,%ymm6,%ymm0,%ymm13
+ vpternlogd $0x96,%ymm13,%ymm12,%ymm11
+ vpclmulqdq $0x10,%ymm7,%ymm1,%ymm12
+ vpclmulqdq $0x10,%ymm8,%ymm2,%ymm13
+ vpternlogd $0x96,%ymm13,%ymm12,%ymm11
+ vpclmulqdq $0x01,%ymm5,%ymm10,%ymm13
+ vpclmulqdq $0x10,%ymm9,%ymm3,%ymm12
+ vpxord %ymm12,%ymm11,%ymm11
+ vpshufd $0x4e,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm6,%ymm0,%ymm0
+ vpclmulqdq $0x11,%ymm7,%ymm1,%ymm1
+ vpclmulqdq $0x11,%ymm8,%ymm2,%ymm2
+ vpternlogd $0x96,%ymm13,%ymm5,%ymm11
+ vpclmulqdq $0x11,%ymm9,%ymm3,%ymm3
+ vpternlogd $0x96,%ymm2,%ymm1,%ymm0
+ vpclmulqdq $0x01,%ymm11,%ymm10,%ymm12
+ vpxord %ymm3,%ymm0,%ymm5
+ vpshufd $0x4e,%ymm11,%ymm11
+ vpternlogd $0x96,%ymm12,%ymm11,%ymm5
+ vextracti32x4 $1,%ymm5,%xmm0
+ vpxord %xmm0,%xmm5,%xmm5
+
+ subq $-128,%rdx
+ addq $-128,%rcx
+ cmpq $128-1,%rcx
+ ja L$aad_loop_4x__func1
+
+
+ cmpq $32,%rcx
+ jb L$aad_large_done__func1
+L$aad_loop_1x__func1:
+ vmovdqu8 (%rdx),%ymm0
+ vpshufb %ymm4,%ymm0,%ymm0
+ vpxord %ymm0,%ymm5,%ymm5
+ vpclmulqdq $0x00,%ymm9,%ymm5,%ymm0
+ vpclmulqdq $0x01,%ymm9,%ymm5,%ymm1
+ vpclmulqdq $0x10,%ymm9,%ymm5,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vpclmulqdq $0x01,%ymm0,%ymm10,%ymm2
+ vpshufd $0x4e,%ymm0,%ymm0
+ vpternlogd $0x96,%ymm2,%ymm0,%ymm1
+ vpclmulqdq $0x11,%ymm9,%ymm5,%ymm5
+ vpclmulqdq $0x01,%ymm1,%ymm10,%ymm0
+ vpshufd $0x4e,%ymm1,%ymm1
+ vpternlogd $0x96,%ymm0,%ymm1,%ymm5
+
+ vextracti32x4 $1,%ymm5,%xmm0
+ vpxord %xmm0,%xmm5,%xmm5
+
+ addq $32,%rdx
+ subq $32,%rcx
+ cmpq $32,%rcx
+ jae L$aad_loop_1x__func1
+
+L$aad_large_done__func1:
+
+
+ vzeroupper
+
+
+L$aad_blockbyblock__func1:
+ testq %rcx,%rcx
+ jz L$aad_done__func1
+ vmovdqu 256-16(%rsi),%xmm9
+L$aad_loop_blockbyblock__func1:
+ vmovdqu (%rdx),%xmm0
+ vpshufb %xmm4,%xmm0,%xmm0
+ vpxor %xmm0,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm9,%xmm5,%xmm0
+ vpclmulqdq $0x01,%xmm9,%xmm5,%xmm1
+ vpclmulqdq $0x10,%xmm9,%xmm5,%xmm2
+ vpxord %xmm2,%xmm1,%xmm1
+ vpclmulqdq $0x01,%xmm0,%xmm10,%xmm2
+ vpshufd $0x4e,%xmm0,%xmm0
+ vpternlogd $0x96,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x11,%xmm9,%xmm5,%xmm5
+ vpclmulqdq $0x01,%xmm1,%xmm10,%xmm0
+ vpshufd $0x4e,%xmm1,%xmm1
+ vpternlogd $0x96,%xmm0,%xmm1,%xmm5
+
+ addq $16,%rdx
+ subq $16,%rcx
+ jnz L$aad_loop_blockbyblock__func1
+
+L$aad_done__func1:
+
+ vpshufb %xmm4,%xmm5,%xmm5
+ vmovdqu %xmm5,(%rdi)
+ ret
+
+
+
+.globl _aes_gcm_enc_update_vaes_avx10_256
+.private_extern _aes_gcm_enc_update_vaes_avx10_256
+
+.p2align 5
+_aes_gcm_enc_update_vaes_avx10_256:
+
+
+_CET_ENDBR
+ pushq %r12
+
+
+ movq 16(%rsp),%r12
+#ifdef BORINGSSL_DISPATCH_TEST
+
+ movb $1,_BORINGSSL_function_hit+6(%rip)
+#endif
+
+ vbroadcasti32x4 L$bswap_mask(%rip),%ymm8
+ vbroadcasti32x4 L$gfpoly(%rip),%ymm31
+
+
+
+ vmovdqu (%r12),%xmm10
+ vpshufb %xmm8,%xmm10,%xmm10
+ vbroadcasti32x4 (%r8),%ymm12
+ vpshufb %ymm8,%ymm12,%ymm12
+
+
+
+ movl 240(%rcx),%r10d
+ leal -20(,%r10,4),%r10d
+
+
+
+
+ leaq 96(%rcx,%r10,4),%r11
+ vbroadcasti32x4 (%rcx),%ymm13
+ vbroadcasti32x4 (%r11),%ymm14
+
+
+ vpaddd L$ctr_pattern(%rip),%ymm12,%ymm12
+
+
+ vbroadcasti32x4 L$inc_2blocks(%rip),%ymm11
+
+
+
+ cmpq $128-1,%rdx
+ jbe L$crypt_loop_4x_done__func1
+
+
+ vmovdqu8 256-128(%r9),%ymm27
+ vmovdqu8 256-96(%r9),%ymm28
+ vmovdqu8 256-64(%r9),%ymm29
+ vmovdqu8 256-32(%r9),%ymm30
+
+
+
+
+ vpshufb %ymm8,%ymm12,%ymm0
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm12,%ymm1
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm12,%ymm2
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm12,%ymm3
+ vpaddd %ymm11,%ymm12,%ymm12
+
+
+ vpxord %ymm13,%ymm0,%ymm0
+ vpxord %ymm13,%ymm1,%ymm1
+ vpxord %ymm13,%ymm2,%ymm2
+ vpxord %ymm13,%ymm3,%ymm3
+
+ leaq 16(%rcx),%rax
+L$vaesenc_loop_first_4_vecs__func1:
+ vbroadcasti32x4 (%rax),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ vaesenc %ymm9,%ymm1,%ymm1
+ vaesenc %ymm9,%ymm2,%ymm2
+ vaesenc %ymm9,%ymm3,%ymm3
+
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne L$vaesenc_loop_first_4_vecs__func1
+
+
+
+ vpxord 0(%rdi),%ymm14,%ymm4
+ vpxord 32(%rdi),%ymm14,%ymm5
+ vpxord 64(%rdi),%ymm14,%ymm6
+ vpxord 96(%rdi),%ymm14,%ymm7
+
+
+
+ vaesenclast %ymm4,%ymm0,%ymm4
+ vaesenclast %ymm5,%ymm1,%ymm5
+ vaesenclast %ymm6,%ymm2,%ymm6
+ vaesenclast %ymm7,%ymm3,%ymm7
+
+
+ vmovdqu8 %ymm4,0(%rsi)
+ vmovdqu8 %ymm5,32(%rsi)
+ vmovdqu8 %ymm6,64(%rsi)
+ vmovdqu8 %ymm7,96(%rsi)
+
+ subq $-128,%rdi
+ subq $-128,%rsi
+ addq $-128,%rdx
+ cmpq $128-1,%rdx
+ jbe L$ghash_last_ciphertext_4x__func1
+ vbroadcasti32x4 -144(%r11),%ymm15
+ vbroadcasti32x4 -128(%r11),%ymm16
+ vbroadcasti32x4 -112(%r11),%ymm17
+ vbroadcasti32x4 -96(%r11),%ymm18
+ vbroadcasti32x4 -80(%r11),%ymm19
+ vbroadcasti32x4 -64(%r11),%ymm20
+ vbroadcasti32x4 -48(%r11),%ymm21
+ vbroadcasti32x4 -32(%r11),%ymm22
+ vbroadcasti32x4 -16(%r11),%ymm23
+L$crypt_loop_4x__func1:
+
+
+
+ vpshufb %ymm8,%ymm12,%ymm0
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm12,%ymm1
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm12,%ymm2
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm12,%ymm3
+ vpaddd %ymm11,%ymm12,%ymm12
+
+
+ vpxord %ymm13,%ymm0,%ymm0
+ vpxord %ymm13,%ymm1,%ymm1
+ vpxord %ymm13,%ymm2,%ymm2
+ vpxord %ymm13,%ymm3,%ymm3
+
+ cmpl $24,%r10d
+ jl L$aes128__func1
+ je L$aes192__func1
+
+ vbroadcasti32x4 -208(%r11),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ vaesenc %ymm9,%ymm1,%ymm1
+ vaesenc %ymm9,%ymm2,%ymm2
+ vaesenc %ymm9,%ymm3,%ymm3
+
+ vbroadcasti32x4 -192(%r11),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ vaesenc %ymm9,%ymm1,%ymm1
+ vaesenc %ymm9,%ymm2,%ymm2
+ vaesenc %ymm9,%ymm3,%ymm3
+
+L$aes192__func1:
+ vbroadcasti32x4 -176(%r11),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ vaesenc %ymm9,%ymm1,%ymm1
+ vaesenc %ymm9,%ymm2,%ymm2
+ vaesenc %ymm9,%ymm3,%ymm3
+
+ vbroadcasti32x4 -160(%r11),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ vaesenc %ymm9,%ymm1,%ymm1
+ vaesenc %ymm9,%ymm2,%ymm2
+ vaesenc %ymm9,%ymm3,%ymm3
+
+L$aes128__func1:
+ vpshufb %ymm8,%ymm4,%ymm4
+ vpxord %ymm10,%ymm4,%ymm4
+ vpshufb %ymm8,%ymm5,%ymm5
+ vpshufb %ymm8,%ymm6,%ymm6
+
+ vaesenc %ymm15,%ymm0,%ymm0
+ vaesenc %ymm15,%ymm1,%ymm1
+ vaesenc %ymm15,%ymm2,%ymm2
+ vaesenc %ymm15,%ymm3,%ymm3
+
+ vpshufb %ymm8,%ymm7,%ymm7
+ vpclmulqdq $0x00,%ymm27,%ymm4,%ymm10
+ vpclmulqdq $0x00,%ymm28,%ymm5,%ymm24
+ vpclmulqdq $0x00,%ymm29,%ymm6,%ymm25
+
+ vaesenc %ymm16,%ymm0,%ymm0
+ vaesenc %ymm16,%ymm1,%ymm1
+ vaesenc %ymm16,%ymm2,%ymm2
+ vaesenc %ymm16,%ymm3,%ymm3
+
+ vpxord %ymm24,%ymm10,%ymm10
+ vpclmulqdq $0x00,%ymm30,%ymm7,%ymm26
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm10
+ vpclmulqdq $0x01,%ymm27,%ymm4,%ymm24
+
+ vaesenc %ymm17,%ymm0,%ymm0
+ vaesenc %ymm17,%ymm1,%ymm1
+ vaesenc %ymm17,%ymm2,%ymm2
+ vaesenc %ymm17,%ymm3,%ymm3
+
+ vpclmulqdq $0x01,%ymm28,%ymm5,%ymm25
+ vpclmulqdq $0x01,%ymm29,%ymm6,%ymm26
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm24
+ vpclmulqdq $0x01,%ymm30,%ymm7,%ymm25
+
+ vaesenc %ymm18,%ymm0,%ymm0
+ vaesenc %ymm18,%ymm1,%ymm1
+ vaesenc %ymm18,%ymm2,%ymm2
+ vaesenc %ymm18,%ymm3,%ymm3
+
+ vpclmulqdq $0x10,%ymm27,%ymm4,%ymm26
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm24
+ vpclmulqdq $0x10,%ymm28,%ymm5,%ymm25
+ vpclmulqdq $0x10,%ymm29,%ymm6,%ymm26
+
+ vaesenc %ymm19,%ymm0,%ymm0
+ vaesenc %ymm19,%ymm1,%ymm1
+ vaesenc %ymm19,%ymm2,%ymm2
+ vaesenc %ymm19,%ymm3,%ymm3
+
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm24
+ vpclmulqdq $0x01,%ymm10,%ymm31,%ymm26
+ vpclmulqdq $0x10,%ymm30,%ymm7,%ymm25
+ vpxord %ymm25,%ymm24,%ymm24
+
+ vaesenc %ymm20,%ymm0,%ymm0
+ vaesenc %ymm20,%ymm1,%ymm1
+ vaesenc %ymm20,%ymm2,%ymm2
+ vaesenc %ymm20,%ymm3,%ymm3
+
+ vpshufd $0x4e,%ymm10,%ymm10
+ vpclmulqdq $0x11,%ymm27,%ymm4,%ymm4
+ vpclmulqdq $0x11,%ymm28,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm29,%ymm6,%ymm6
+
+ vaesenc %ymm21,%ymm0,%ymm0
+ vaesenc %ymm21,%ymm1,%ymm1
+ vaesenc %ymm21,%ymm2,%ymm2
+ vaesenc %ymm21,%ymm3,%ymm3
+
+ vpternlogd $0x96,%ymm26,%ymm10,%ymm24
+ vpclmulqdq $0x11,%ymm30,%ymm7,%ymm7
+ vpternlogd $0x96,%ymm6,%ymm5,%ymm4
+ vpclmulqdq $0x01,%ymm24,%ymm31,%ymm25
+
+ vaesenc %ymm22,%ymm0,%ymm0
+ vaesenc %ymm22,%ymm1,%ymm1
+ vaesenc %ymm22,%ymm2,%ymm2
+ vaesenc %ymm22,%ymm3,%ymm3
+
+ vpxord %ymm7,%ymm4,%ymm10
+ vpshufd $0x4e,%ymm24,%ymm24
+ vpternlogd $0x96,%ymm25,%ymm24,%ymm10
+
+ vaesenc %ymm23,%ymm0,%ymm0
+ vaesenc %ymm23,%ymm1,%ymm1
+ vaesenc %ymm23,%ymm2,%ymm2
+ vaesenc %ymm23,%ymm3,%ymm3
+
+ vextracti32x4 $1,%ymm10,%xmm4
+ vpxord %xmm4,%xmm10,%xmm10
+
+
+
+
+ vpxord 0(%rdi),%ymm14,%ymm4
+ vpxord 32(%rdi),%ymm14,%ymm5
+ vpxord 64(%rdi),%ymm14,%ymm6
+ vpxord 96(%rdi),%ymm14,%ymm7
+
+
+
+ vaesenclast %ymm4,%ymm0,%ymm4
+ vaesenclast %ymm5,%ymm1,%ymm5
+ vaesenclast %ymm6,%ymm2,%ymm6
+ vaesenclast %ymm7,%ymm3,%ymm7
+
+
+ vmovdqu8 %ymm4,0(%rsi)
+ vmovdqu8 %ymm5,32(%rsi)
+ vmovdqu8 %ymm6,64(%rsi)
+ vmovdqu8 %ymm7,96(%rsi)
+
+ subq $-128,%rdi
+ subq $-128,%rsi
+ addq $-128,%rdx
+ cmpq $128-1,%rdx
+ ja L$crypt_loop_4x__func1
+L$ghash_last_ciphertext_4x__func1:
+ vpshufb %ymm8,%ymm4,%ymm4
+ vpxord %ymm10,%ymm4,%ymm4
+ vpshufb %ymm8,%ymm5,%ymm5
+ vpshufb %ymm8,%ymm6,%ymm6
+ vpshufb %ymm8,%ymm7,%ymm7
+ vpclmulqdq $0x00,%ymm27,%ymm4,%ymm10
+ vpclmulqdq $0x00,%ymm28,%ymm5,%ymm24
+ vpclmulqdq $0x00,%ymm29,%ymm6,%ymm25
+ vpxord %ymm24,%ymm10,%ymm10
+ vpclmulqdq $0x00,%ymm30,%ymm7,%ymm26
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm10
+ vpclmulqdq $0x01,%ymm27,%ymm4,%ymm24
+ vpclmulqdq $0x01,%ymm28,%ymm5,%ymm25
+ vpclmulqdq $0x01,%ymm29,%ymm6,%ymm26
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm24
+ vpclmulqdq $0x01,%ymm30,%ymm7,%ymm25
+ vpclmulqdq $0x10,%ymm27,%ymm4,%ymm26
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm24
+ vpclmulqdq $0x10,%ymm28,%ymm5,%ymm25
+ vpclmulqdq $0x10,%ymm29,%ymm6,%ymm26
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm24
+ vpclmulqdq $0x01,%ymm10,%ymm31,%ymm26
+ vpclmulqdq $0x10,%ymm30,%ymm7,%ymm25
+ vpxord %ymm25,%ymm24,%ymm24
+ vpshufd $0x4e,%ymm10,%ymm10
+ vpclmulqdq $0x11,%ymm27,%ymm4,%ymm4
+ vpclmulqdq $0x11,%ymm28,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm29,%ymm6,%ymm6
+ vpternlogd $0x96,%ymm26,%ymm10,%ymm24
+ vpclmulqdq $0x11,%ymm30,%ymm7,%ymm7
+ vpternlogd $0x96,%ymm6,%ymm5,%ymm4
+ vpclmulqdq $0x01,%ymm24,%ymm31,%ymm25
+ vpxord %ymm7,%ymm4,%ymm10
+ vpshufd $0x4e,%ymm24,%ymm24
+ vpternlogd $0x96,%ymm25,%ymm24,%ymm10
+ vextracti32x4 $1,%ymm10,%xmm4
+ vpxord %xmm4,%xmm10,%xmm10
+
+L$crypt_loop_4x_done__func1:
+
+ testq %rdx,%rdx
+ jz L$done__func1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ movq %rdx,%rax
+ negq %rax
+ andq $-16,%rax
+ leaq 256(%r9,%rax,1),%r8
+ vpxor %xmm4,%xmm4,%xmm4
+ vpxor %xmm5,%xmm5,%xmm5
+ vpxor %xmm6,%xmm6,%xmm6
+
+ cmpq $32,%rdx
+ jb L$partial_vec__func1
+
+L$crypt_loop_1x__func1:
+
+
+
+ vpshufb %ymm8,%ymm12,%ymm0
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpxord %ymm13,%ymm0,%ymm0
+ leaq 16(%rcx),%rax
+L$vaesenc_loop_tail_full_vec__func1:
+ vbroadcasti32x4 (%rax),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne L$vaesenc_loop_tail_full_vec__func1
+ vaesenclast %ymm14,%ymm0,%ymm0
+
+
+ vmovdqu8 (%rdi),%ymm1
+ vpxord %ymm1,%ymm0,%ymm0
+ vmovdqu8 %ymm0,(%rsi)
+
+
+ vmovdqu8 (%r8),%ymm30
+ vpshufb %ymm8,%ymm0,%ymm0
+ vpxord %ymm10,%ymm0,%ymm0
+ vpclmulqdq $0x00,%ymm30,%ymm0,%ymm7
+ vpclmulqdq $0x01,%ymm30,%ymm0,%ymm1
+ vpclmulqdq $0x10,%ymm30,%ymm0,%ymm2
+ vpclmulqdq $0x11,%ymm30,%ymm0,%ymm3
+ vpxord %ymm7,%ymm4,%ymm4
+ vpternlogd $0x96,%ymm2,%ymm1,%ymm5
+ vpxord %ymm3,%ymm6,%ymm6
+
+ vpxor %xmm10,%xmm10,%xmm10
+
+ addq $32,%r8
+ addq $32,%rdi
+ addq $32,%rsi
+ subq $32,%rdx
+ cmpq $32,%rdx
+ jae L$crypt_loop_1x__func1
+
+ testq %rdx,%rdx
+ jz L$reduce__func1
+
+L$partial_vec__func1:
+
+
+
+
+ movq $-1,%rax
+ bzhiq %rdx,%rax,%rax
+ kmovd %eax,%k1
+ addq $15,%rdx
+ andq $-16,%rdx
+ movq $-1,%rax
+ bzhiq %rdx,%rax,%rax
+ kmovd %eax,%k2
+
+
+
+ vpshufb %ymm8,%ymm12,%ymm0
+ vpxord %ymm13,%ymm0,%ymm0
+ leaq 16(%rcx),%rax
+L$vaesenc_loop_tail_partialvec__func1:
+ vbroadcasti32x4 (%rax),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne L$vaesenc_loop_tail_partialvec__func1
+ vaesenclast %ymm14,%ymm0,%ymm0
+
+
+ vmovdqu8 (%rdi),%ymm1{%k1}{z}
+ vpxord %ymm1,%ymm0,%ymm0
+ vmovdqu8 %ymm0,(%rsi){%k1}
+
+
+
+
+
+
+
+
+
+
+
+
+
+ vmovdqu8 (%r8),%ymm30{%k2}{z}
+ vmovdqu8 %ymm0,%ymm1{%k1}{z}
+ vpshufb %ymm8,%ymm1,%ymm0
+ vpxord %ymm10,%ymm0,%ymm0
+ vpclmulqdq $0x00,%ymm30,%ymm0,%ymm7
+ vpclmulqdq $0x01,%ymm30,%ymm0,%ymm1
+ vpclmulqdq $0x10,%ymm30,%ymm0,%ymm2
+ vpclmulqdq $0x11,%ymm30,%ymm0,%ymm3
+ vpxord %ymm7,%ymm4,%ymm4
+ vpternlogd $0x96,%ymm2,%ymm1,%ymm5
+ vpxord %ymm3,%ymm6,%ymm6
+
+
+L$reduce__func1:
+
+ vpclmulqdq $0x01,%ymm4,%ymm31,%ymm0
+ vpshufd $0x4e,%ymm4,%ymm4
+ vpternlogd $0x96,%ymm0,%ymm4,%ymm5
+ vpclmulqdq $0x01,%ymm5,%ymm31,%ymm0
+ vpshufd $0x4e,%ymm5,%ymm5
+ vpternlogd $0x96,%ymm0,%ymm5,%ymm6
+
+ vextracti32x4 $1,%ymm6,%xmm0
+ vpxord %xmm0,%xmm6,%xmm10
+
+
+L$done__func1:
+
+ vpshufb %xmm8,%xmm10,%xmm10
+ vmovdqu %xmm10,(%r12)
+
+ vzeroupper
+ popq %r12
+
+ ret
+
+
+
+.globl _aes_gcm_dec_update_vaes_avx10_256
+.private_extern _aes_gcm_dec_update_vaes_avx10_256
+
+.p2align 5
+_aes_gcm_dec_update_vaes_avx10_256:
+
+
+_CET_ENDBR
+ pushq %r12
+
+
+ movq 16(%rsp),%r12
+
+ vbroadcasti32x4 L$bswap_mask(%rip),%ymm8
+ vbroadcasti32x4 L$gfpoly(%rip),%ymm31
+
+
+
+ vmovdqu (%r12),%xmm10
+ vpshufb %xmm8,%xmm10,%xmm10
+ vbroadcasti32x4 (%r8),%ymm12
+ vpshufb %ymm8,%ymm12,%ymm12
+
+
+
+ movl 240(%rcx),%r10d
+ leal -20(,%r10,4),%r10d
+
+
+
+
+ leaq 96(%rcx,%r10,4),%r11
+ vbroadcasti32x4 (%rcx),%ymm13
+ vbroadcasti32x4 (%r11),%ymm14
+
+
+ vpaddd L$ctr_pattern(%rip),%ymm12,%ymm12
+
+
+ vbroadcasti32x4 L$inc_2blocks(%rip),%ymm11
+
+
+
+ cmpq $128-1,%rdx
+ jbe L$crypt_loop_4x_done__func2
+
+
+ vmovdqu8 256-128(%r9),%ymm27
+ vmovdqu8 256-96(%r9),%ymm28
+ vmovdqu8 256-64(%r9),%ymm29
+ vmovdqu8 256-32(%r9),%ymm30
+ vbroadcasti32x4 -144(%r11),%ymm15
+ vbroadcasti32x4 -128(%r11),%ymm16
+ vbroadcasti32x4 -112(%r11),%ymm17
+ vbroadcasti32x4 -96(%r11),%ymm18
+ vbroadcasti32x4 -80(%r11),%ymm19
+ vbroadcasti32x4 -64(%r11),%ymm20
+ vbroadcasti32x4 -48(%r11),%ymm21
+ vbroadcasti32x4 -32(%r11),%ymm22
+ vbroadcasti32x4 -16(%r11),%ymm23
+L$crypt_loop_4x__func2:
+ vmovdqu8 0(%rdi),%ymm4
+ vmovdqu8 32(%rdi),%ymm5
+ vmovdqu8 64(%rdi),%ymm6
+ vmovdqu8 96(%rdi),%ymm7
+
+
+
+ vpshufb %ymm8,%ymm12,%ymm0
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm12,%ymm1
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm12,%ymm2
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm12,%ymm3
+ vpaddd %ymm11,%ymm12,%ymm12
+
+
+ vpxord %ymm13,%ymm0,%ymm0
+ vpxord %ymm13,%ymm1,%ymm1
+ vpxord %ymm13,%ymm2,%ymm2
+ vpxord %ymm13,%ymm3,%ymm3
+
+ cmpl $24,%r10d
+ jl L$aes128__func2
+ je L$aes192__func2
+
+ vbroadcasti32x4 -208(%r11),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ vaesenc %ymm9,%ymm1,%ymm1
+ vaesenc %ymm9,%ymm2,%ymm2
+ vaesenc %ymm9,%ymm3,%ymm3
+
+ vbroadcasti32x4 -192(%r11),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ vaesenc %ymm9,%ymm1,%ymm1
+ vaesenc %ymm9,%ymm2,%ymm2
+ vaesenc %ymm9,%ymm3,%ymm3
+
+L$aes192__func2:
+ vbroadcasti32x4 -176(%r11),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ vaesenc %ymm9,%ymm1,%ymm1
+ vaesenc %ymm9,%ymm2,%ymm2
+ vaesenc %ymm9,%ymm3,%ymm3
+
+ vbroadcasti32x4 -160(%r11),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ vaesenc %ymm9,%ymm1,%ymm1
+ vaesenc %ymm9,%ymm2,%ymm2
+ vaesenc %ymm9,%ymm3,%ymm3
+
+L$aes128__func2:
+ vpshufb %ymm8,%ymm4,%ymm4
+ vpxord %ymm10,%ymm4,%ymm4
+ vpshufb %ymm8,%ymm5,%ymm5
+ vpshufb %ymm8,%ymm6,%ymm6
+
+ vaesenc %ymm15,%ymm0,%ymm0
+ vaesenc %ymm15,%ymm1,%ymm1
+ vaesenc %ymm15,%ymm2,%ymm2
+ vaesenc %ymm15,%ymm3,%ymm3
+
+ vpshufb %ymm8,%ymm7,%ymm7
+ vpclmulqdq $0x00,%ymm27,%ymm4,%ymm10
+ vpclmulqdq $0x00,%ymm28,%ymm5,%ymm24
+ vpclmulqdq $0x00,%ymm29,%ymm6,%ymm25
+
+ vaesenc %ymm16,%ymm0,%ymm0
+ vaesenc %ymm16,%ymm1,%ymm1
+ vaesenc %ymm16,%ymm2,%ymm2
+ vaesenc %ymm16,%ymm3,%ymm3
+
+ vpxord %ymm24,%ymm10,%ymm10
+ vpclmulqdq $0x00,%ymm30,%ymm7,%ymm26
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm10
+ vpclmulqdq $0x01,%ymm27,%ymm4,%ymm24
+
+ vaesenc %ymm17,%ymm0,%ymm0
+ vaesenc %ymm17,%ymm1,%ymm1
+ vaesenc %ymm17,%ymm2,%ymm2
+ vaesenc %ymm17,%ymm3,%ymm3
+
+ vpclmulqdq $0x01,%ymm28,%ymm5,%ymm25
+ vpclmulqdq $0x01,%ymm29,%ymm6,%ymm26
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm24
+ vpclmulqdq $0x01,%ymm30,%ymm7,%ymm25
+
+ vaesenc %ymm18,%ymm0,%ymm0
+ vaesenc %ymm18,%ymm1,%ymm1
+ vaesenc %ymm18,%ymm2,%ymm2
+ vaesenc %ymm18,%ymm3,%ymm3
+
+ vpclmulqdq $0x10,%ymm27,%ymm4,%ymm26
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm24
+ vpclmulqdq $0x10,%ymm28,%ymm5,%ymm25
+ vpclmulqdq $0x10,%ymm29,%ymm6,%ymm26
+
+ vaesenc %ymm19,%ymm0,%ymm0
+ vaesenc %ymm19,%ymm1,%ymm1
+ vaesenc %ymm19,%ymm2,%ymm2
+ vaesenc %ymm19,%ymm3,%ymm3
+
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm24
+ vpclmulqdq $0x01,%ymm10,%ymm31,%ymm26
+ vpclmulqdq $0x10,%ymm30,%ymm7,%ymm25
+ vpxord %ymm25,%ymm24,%ymm24
+
+ vaesenc %ymm20,%ymm0,%ymm0
+ vaesenc %ymm20,%ymm1,%ymm1
+ vaesenc %ymm20,%ymm2,%ymm2
+ vaesenc %ymm20,%ymm3,%ymm3
+
+ vpshufd $0x4e,%ymm10,%ymm10
+ vpclmulqdq $0x11,%ymm27,%ymm4,%ymm4
+ vpclmulqdq $0x11,%ymm28,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm29,%ymm6,%ymm6
+
+ vaesenc %ymm21,%ymm0,%ymm0
+ vaesenc %ymm21,%ymm1,%ymm1
+ vaesenc %ymm21,%ymm2,%ymm2
+ vaesenc %ymm21,%ymm3,%ymm3
+
+ vpternlogd $0x96,%ymm26,%ymm10,%ymm24
+ vpclmulqdq $0x11,%ymm30,%ymm7,%ymm7
+ vpternlogd $0x96,%ymm6,%ymm5,%ymm4
+ vpclmulqdq $0x01,%ymm24,%ymm31,%ymm25
+
+ vaesenc %ymm22,%ymm0,%ymm0
+ vaesenc %ymm22,%ymm1,%ymm1
+ vaesenc %ymm22,%ymm2,%ymm2
+ vaesenc %ymm22,%ymm3,%ymm3
+
+ vpxord %ymm7,%ymm4,%ymm10
+ vpshufd $0x4e,%ymm24,%ymm24
+ vpternlogd $0x96,%ymm25,%ymm24,%ymm10
+
+ vaesenc %ymm23,%ymm0,%ymm0
+ vaesenc %ymm23,%ymm1,%ymm1
+ vaesenc %ymm23,%ymm2,%ymm2
+ vaesenc %ymm23,%ymm3,%ymm3
+
+ vextracti32x4 $1,%ymm10,%xmm4
+ vpxord %xmm4,%xmm10,%xmm10
+
+
+
+
+ vpxord 0(%rdi),%ymm14,%ymm4
+ vpxord 32(%rdi),%ymm14,%ymm5
+ vpxord 64(%rdi),%ymm14,%ymm6
+ vpxord 96(%rdi),%ymm14,%ymm7
+
+
+
+ vaesenclast %ymm4,%ymm0,%ymm4
+ vaesenclast %ymm5,%ymm1,%ymm5
+ vaesenclast %ymm6,%ymm2,%ymm6
+ vaesenclast %ymm7,%ymm3,%ymm7
+
+
+ vmovdqu8 %ymm4,0(%rsi)
+ vmovdqu8 %ymm5,32(%rsi)
+ vmovdqu8 %ymm6,64(%rsi)
+ vmovdqu8 %ymm7,96(%rsi)
+
+ subq $-128,%rdi
+ subq $-128,%rsi
+ addq $-128,%rdx
+ cmpq $128-1,%rdx
+ ja L$crypt_loop_4x__func2
+L$crypt_loop_4x_done__func2:
+
+ testq %rdx,%rdx
+ jz L$done__func2
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ movq %rdx,%rax
+ negq %rax
+ andq $-16,%rax
+ leaq 256(%r9,%rax,1),%r8
+ vpxor %xmm4,%xmm4,%xmm4
+ vpxor %xmm5,%xmm5,%xmm5
+ vpxor %xmm6,%xmm6,%xmm6
+
+ cmpq $32,%rdx
+ jb L$partial_vec__func2
+
+L$crypt_loop_1x__func2:
+
+
+
+ vpshufb %ymm8,%ymm12,%ymm0
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpxord %ymm13,%ymm0,%ymm0
+ leaq 16(%rcx),%rax
+L$vaesenc_loop_tail_full_vec__func2:
+ vbroadcasti32x4 (%rax),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne L$vaesenc_loop_tail_full_vec__func2
+ vaesenclast %ymm14,%ymm0,%ymm0
+
+
+ vmovdqu8 (%rdi),%ymm1
+ vpxord %ymm1,%ymm0,%ymm0
+ vmovdqu8 %ymm0,(%rsi)
+
+
+ vmovdqu8 (%r8),%ymm30
+ vpshufb %ymm8,%ymm1,%ymm0
+ vpxord %ymm10,%ymm0,%ymm0
+ vpclmulqdq $0x00,%ymm30,%ymm0,%ymm7
+ vpclmulqdq $0x01,%ymm30,%ymm0,%ymm1
+ vpclmulqdq $0x10,%ymm30,%ymm0,%ymm2
+ vpclmulqdq $0x11,%ymm30,%ymm0,%ymm3
+ vpxord %ymm7,%ymm4,%ymm4
+ vpternlogd $0x96,%ymm2,%ymm1,%ymm5
+ vpxord %ymm3,%ymm6,%ymm6
+
+ vpxor %xmm10,%xmm10,%xmm10
+
+ addq $32,%r8
+ addq $32,%rdi
+ addq $32,%rsi
+ subq $32,%rdx
+ cmpq $32,%rdx
+ jae L$crypt_loop_1x__func2
+
+ testq %rdx,%rdx
+ jz L$reduce__func2
+
+L$partial_vec__func2:
+
+
+
+
+ movq $-1,%rax
+ bzhiq %rdx,%rax,%rax
+ kmovd %eax,%k1
+ addq $15,%rdx
+ andq $-16,%rdx
+ movq $-1,%rax
+ bzhiq %rdx,%rax,%rax
+ kmovd %eax,%k2
+
+
+
+ vpshufb %ymm8,%ymm12,%ymm0
+ vpxord %ymm13,%ymm0,%ymm0
+ leaq 16(%rcx),%rax
+L$vaesenc_loop_tail_partialvec__func2:
+ vbroadcasti32x4 (%rax),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne L$vaesenc_loop_tail_partialvec__func2
+ vaesenclast %ymm14,%ymm0,%ymm0
+
+
+ vmovdqu8 (%rdi),%ymm1{%k1}{z}
+ vpxord %ymm1,%ymm0,%ymm0
+ vmovdqu8 %ymm0,(%rsi){%k1}
+
+
+
+
+
+
+
+
+
+
+
+
+
+ vmovdqu8 (%r8),%ymm30{%k2}{z}
+
+ vpshufb %ymm8,%ymm1,%ymm0
+ vpxord %ymm10,%ymm0,%ymm0
+ vpclmulqdq $0x00,%ymm30,%ymm0,%ymm7
+ vpclmulqdq $0x01,%ymm30,%ymm0,%ymm1
+ vpclmulqdq $0x10,%ymm30,%ymm0,%ymm2
+ vpclmulqdq $0x11,%ymm30,%ymm0,%ymm3
+ vpxord %ymm7,%ymm4,%ymm4
+ vpternlogd $0x96,%ymm2,%ymm1,%ymm5
+ vpxord %ymm3,%ymm6,%ymm6
+
+
+L$reduce__func2:
+
+ vpclmulqdq $0x01,%ymm4,%ymm31,%ymm0
+ vpshufd $0x4e,%ymm4,%ymm4
+ vpternlogd $0x96,%ymm0,%ymm4,%ymm5
+ vpclmulqdq $0x01,%ymm5,%ymm31,%ymm0
+ vpshufd $0x4e,%ymm5,%ymm5
+ vpternlogd $0x96,%ymm0,%ymm5,%ymm6
+
+ vextracti32x4 $1,%ymm6,%xmm0
+ vpxord %xmm0,%xmm6,%xmm10
+
+
+L$done__func2:
+
+ vpshufb %xmm8,%xmm10,%xmm10
+ vmovdqu %xmm10,(%r12)
+
+ vzeroupper
+ popq %r12
+
+ ret
+
+
+
+.globl _gcm_ghash_vpclmulqdq_avx10_512
+.private_extern _gcm_ghash_vpclmulqdq_avx10_512
+
+.p2align 5
+_gcm_ghash_vpclmulqdq_avx10_512:
+
+
+_CET_ENDBR
+
+
+
+
+
+
+ vmovdqu L$bswap_mask(%rip),%xmm4
+ vmovdqu L$gfpoly(%rip),%xmm10
+
+
+ vmovdqu (%rdi),%xmm5
+ vpshufb %xmm4,%xmm5,%xmm5
+
+
+ cmpq $64,%rcx
+ jb L$aad_blockbyblock__func2
+
+
+
+ vshufi64x2 $0,%zmm4,%zmm4,%zmm4
+ vshufi64x2 $0,%zmm10,%zmm10,%zmm10
+
+
+ vmovdqu8 256-64(%rsi),%zmm9
+
+ cmpq $256-1,%rcx
+ jbe L$aad_loop_1x__func2
+
+
+ vmovdqu8 256-256(%rsi),%zmm6
+ vmovdqu8 256-192(%rsi),%zmm7
+ vmovdqu8 256-128(%rsi),%zmm8
+
+
+L$aad_loop_4x__func2:
+ vmovdqu8 0(%rdx),%zmm0
+ vmovdqu8 64(%rdx),%zmm1
+ vmovdqu8 128(%rdx),%zmm2
+ vmovdqu8 192(%rdx),%zmm3
+ vpshufb %zmm4,%zmm0,%zmm0
+ vpxord %zmm5,%zmm0,%zmm0
+ vpshufb %zmm4,%zmm1,%zmm1
+ vpshufb %zmm4,%zmm2,%zmm2
+ vpshufb %zmm4,%zmm3,%zmm3
+ vpclmulqdq $0x00,%zmm6,%zmm0,%zmm5
+ vpclmulqdq $0x00,%zmm7,%zmm1,%zmm11
+ vpclmulqdq $0x00,%zmm8,%zmm2,%zmm12
+ vpxord %zmm11,%zmm5,%zmm5
+ vpclmulqdq $0x00,%zmm9,%zmm3,%zmm13
+ vpternlogd $0x96,%zmm13,%zmm12,%zmm5
+ vpclmulqdq $0x01,%zmm6,%zmm0,%zmm11
+ vpclmulqdq $0x01,%zmm7,%zmm1,%zmm12
+ vpclmulqdq $0x01,%zmm8,%zmm2,%zmm13
+ vpternlogd $0x96,%zmm13,%zmm12,%zmm11
+ vpclmulqdq $0x01,%zmm9,%zmm3,%zmm12
+ vpclmulqdq $0x10,%zmm6,%zmm0,%zmm13
+ vpternlogd $0x96,%zmm13,%zmm12,%zmm11
+ vpclmulqdq $0x10,%zmm7,%zmm1,%zmm12
+ vpclmulqdq $0x10,%zmm8,%zmm2,%zmm13
+ vpternlogd $0x96,%zmm13,%zmm12,%zmm11
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm13
+ vpclmulqdq $0x10,%zmm9,%zmm3,%zmm12
+ vpxord %zmm12,%zmm11,%zmm11
+ vpshufd $0x4e,%zmm5,%zmm5
+ vpclmulqdq $0x11,%zmm6,%zmm0,%zmm0
+ vpclmulqdq $0x11,%zmm7,%zmm1,%zmm1
+ vpclmulqdq $0x11,%zmm8,%zmm2,%zmm2
+ vpternlogd $0x96,%zmm13,%zmm5,%zmm11
+ vpclmulqdq $0x11,%zmm9,%zmm3,%zmm3
+ vpternlogd $0x96,%zmm2,%zmm1,%zmm0
+ vpclmulqdq $0x01,%zmm11,%zmm10,%zmm12
+ vpxord %zmm3,%zmm0,%zmm5
+ vpshufd $0x4e,%zmm11,%zmm11
+ vpternlogd $0x96,%zmm12,%zmm11,%zmm5
+ vextracti32x4 $1,%zmm5,%xmm0
+ vextracti32x4 $2,%zmm5,%xmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vpxord %xmm0,%xmm5,%xmm5
+ vpternlogd $0x96,%xmm1,%xmm2,%xmm5
+
+ subq $-256,%rdx
+ addq $-256,%rcx
+ cmpq $256-1,%rcx
+ ja L$aad_loop_4x__func2
+
+
+ cmpq $64,%rcx
+ jb L$aad_large_done__func2
+L$aad_loop_1x__func2:
+ vmovdqu8 (%rdx),%zmm0
+ vpshufb %zmm4,%zmm0,%zmm0
+ vpxord %zmm0,%zmm5,%zmm5
+ vpclmulqdq $0x00,%zmm9,%zmm5,%zmm0
+ vpclmulqdq $0x01,%zmm9,%zmm5,%zmm1
+ vpclmulqdq $0x10,%zmm9,%zmm5,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm0,%zmm10,%zmm2
+ vpshufd $0x4e,%zmm0,%zmm0
+ vpternlogd $0x96,%zmm2,%zmm0,%zmm1
+ vpclmulqdq $0x11,%zmm9,%zmm5,%zmm5
+ vpclmulqdq $0x01,%zmm1,%zmm10,%zmm0
+ vpshufd $0x4e,%zmm1,%zmm1
+ vpternlogd $0x96,%zmm0,%zmm1,%zmm5
+
+ vextracti32x4 $1,%zmm5,%xmm0
+ vextracti32x4 $2,%zmm5,%xmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vpxord %xmm0,%xmm5,%xmm5
+ vpternlogd $0x96,%xmm1,%xmm2,%xmm5
+
+ addq $64,%rdx
+ subq $64,%rcx
+ cmpq $64,%rcx
+ jae L$aad_loop_1x__func2
+
+L$aad_large_done__func2:
+
+
+ vzeroupper
+
+
+L$aad_blockbyblock__func2:
+ testq %rcx,%rcx
+ jz L$aad_done__func2
+ vmovdqu 256-16(%rsi),%xmm9
+L$aad_loop_blockbyblock__func2:
+ vmovdqu (%rdx),%xmm0
+ vpshufb %xmm4,%xmm0,%xmm0
+ vpxor %xmm0,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm9,%xmm5,%xmm0
+ vpclmulqdq $0x01,%xmm9,%xmm5,%xmm1
+ vpclmulqdq $0x10,%xmm9,%xmm5,%xmm2
+ vpxord %xmm2,%xmm1,%xmm1
+ vpclmulqdq $0x01,%xmm0,%xmm10,%xmm2
+ vpshufd $0x4e,%xmm0,%xmm0
+ vpternlogd $0x96,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x11,%xmm9,%xmm5,%xmm5
+ vpclmulqdq $0x01,%xmm1,%xmm10,%xmm0
+ vpshufd $0x4e,%xmm1,%xmm1
+ vpternlogd $0x96,%xmm0,%xmm1,%xmm5
+
+ addq $16,%rdx
+ subq $16,%rcx
+ jnz L$aad_loop_blockbyblock__func2
+
+L$aad_done__func2:
+
+ vpshufb %xmm4,%xmm5,%xmm5
+ vmovdqu %xmm5,(%rdi)
+ ret
+
+
+
+.globl _aes_gcm_enc_update_vaes_avx10_512
+.private_extern _aes_gcm_enc_update_vaes_avx10_512
+
+.p2align 5
+_aes_gcm_enc_update_vaes_avx10_512:
+
+
+_CET_ENDBR
+ pushq %r12
+
+
+ movq 16(%rsp),%r12
+#ifdef BORINGSSL_DISPATCH_TEST
+
+ movb $1,_BORINGSSL_function_hit+7(%rip)
+#endif
+
+ vbroadcasti32x4 L$bswap_mask(%rip),%zmm8
+ vbroadcasti32x4 L$gfpoly(%rip),%zmm31
+
+
+
+ vmovdqu (%r12),%xmm10
+ vpshufb %xmm8,%xmm10,%xmm10
+ vbroadcasti32x4 (%r8),%zmm12
+ vpshufb %zmm8,%zmm12,%zmm12
+
+
+
+ movl 240(%rcx),%r10d
+ leal -20(,%r10,4),%r10d
+
+
+
+
+ leaq 96(%rcx,%r10,4),%r11
+ vbroadcasti32x4 (%rcx),%zmm13
+ vbroadcasti32x4 (%r11),%zmm14
+
+
+ vpaddd L$ctr_pattern(%rip),%zmm12,%zmm12
+
+
+ vbroadcasti32x4 L$inc_4blocks(%rip),%zmm11
+
+
+
+ cmpq $256-1,%rdx
+ jbe L$crypt_loop_4x_done__func3
+
+
+ vmovdqu8 256-256(%r9),%zmm27
+ vmovdqu8 256-192(%r9),%zmm28
+ vmovdqu8 256-128(%r9),%zmm29
+ vmovdqu8 256-64(%r9),%zmm30
+
+
+
+
+ vpshufb %zmm8,%zmm12,%zmm0
+ vpaddd %zmm11,%zmm12,%zmm12
+ vpshufb %zmm8,%zmm12,%zmm1
+ vpaddd %zmm11,%zmm12,%zmm12
+ vpshufb %zmm8,%zmm12,%zmm2
+ vpaddd %zmm11,%zmm12,%zmm12
+ vpshufb %zmm8,%zmm12,%zmm3
+ vpaddd %zmm11,%zmm12,%zmm12
+
+
+ vpxord %zmm13,%zmm0,%zmm0
+ vpxord %zmm13,%zmm1,%zmm1
+ vpxord %zmm13,%zmm2,%zmm2
+ vpxord %zmm13,%zmm3,%zmm3
+
+ leaq 16(%rcx),%rax
+L$vaesenc_loop_first_4_vecs__func3:
+ vbroadcasti32x4 (%rax),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ vaesenc %zmm9,%zmm1,%zmm1
+ vaesenc %zmm9,%zmm2,%zmm2
+ vaesenc %zmm9,%zmm3,%zmm3
+
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne L$vaesenc_loop_first_4_vecs__func3
+
+
+
+ vpxord 0(%rdi),%zmm14,%zmm4
+ vpxord 64(%rdi),%zmm14,%zmm5
+ vpxord 128(%rdi),%zmm14,%zmm6
+ vpxord 192(%rdi),%zmm14,%zmm7
+
+
+
+ vaesenclast %zmm4,%zmm0,%zmm4
+ vaesenclast %zmm5,%zmm1,%zmm5
+ vaesenclast %zmm6,%zmm2,%zmm6
+ vaesenclast %zmm7,%zmm3,%zmm7
+
+
+ vmovdqu8 %zmm4,0(%rsi)
+ vmovdqu8 %zmm5,64(%rsi)
+ vmovdqu8 %zmm6,128(%rsi)
+ vmovdqu8 %zmm7,192(%rsi)
+
+ subq $-256,%rdi
+ subq $-256,%rsi
+ addq $-256,%rdx
+ cmpq $256-1,%rdx
+ jbe L$ghash_last_ciphertext_4x__func3
+ vbroadcasti32x4 -144(%r11),%zmm15
+ vbroadcasti32x4 -128(%r11),%zmm16
+ vbroadcasti32x4 -112(%r11),%zmm17
+ vbroadcasti32x4 -96(%r11),%zmm18
+ vbroadcasti32x4 -80(%r11),%zmm19
+ vbroadcasti32x4 -64(%r11),%zmm20
+ vbroadcasti32x4 -48(%r11),%zmm21
+ vbroadcasti32x4 -32(%r11),%zmm22
+ vbroadcasti32x4 -16(%r11),%zmm23
+L$crypt_loop_4x__func3:
+
+
+
+ vpshufb %zmm8,%zmm12,%zmm0
+ vpaddd %zmm11,%zmm12,%zmm12
+ vpshufb %zmm8,%zmm12,%zmm1
+ vpaddd %zmm11,%zmm12,%zmm12
+ vpshufb %zmm8,%zmm12,%zmm2
+ vpaddd %zmm11,%zmm12,%zmm12
+ vpshufb %zmm8,%zmm12,%zmm3
+ vpaddd %zmm11,%zmm12,%zmm12
+
+
+ vpxord %zmm13,%zmm0,%zmm0
+ vpxord %zmm13,%zmm1,%zmm1
+ vpxord %zmm13,%zmm2,%zmm2
+ vpxord %zmm13,%zmm3,%zmm3
+
+ cmpl $24,%r10d
+ jl L$aes128__func3
+ je L$aes192__func3
+
+ vbroadcasti32x4 -208(%r11),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ vaesenc %zmm9,%zmm1,%zmm1
+ vaesenc %zmm9,%zmm2,%zmm2
+ vaesenc %zmm9,%zmm3,%zmm3
+
+ vbroadcasti32x4 -192(%r11),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ vaesenc %zmm9,%zmm1,%zmm1
+ vaesenc %zmm9,%zmm2,%zmm2
+ vaesenc %zmm9,%zmm3,%zmm3
+
+L$aes192__func3:
+ vbroadcasti32x4 -176(%r11),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ vaesenc %zmm9,%zmm1,%zmm1
+ vaesenc %zmm9,%zmm2,%zmm2
+ vaesenc %zmm9,%zmm3,%zmm3
+
+ vbroadcasti32x4 -160(%r11),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ vaesenc %zmm9,%zmm1,%zmm1
+ vaesenc %zmm9,%zmm2,%zmm2
+ vaesenc %zmm9,%zmm3,%zmm3
+
+L$aes128__func3:
+ vpshufb %zmm8,%zmm4,%zmm4
+ vpxord %zmm10,%zmm4,%zmm4
+ vpshufb %zmm8,%zmm5,%zmm5
+ vpshufb %zmm8,%zmm6,%zmm6
+
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm1,%zmm1
+ vaesenc %zmm15,%zmm2,%zmm2
+ vaesenc %zmm15,%zmm3,%zmm3
+
+ vpshufb %zmm8,%zmm7,%zmm7
+ vpclmulqdq $0x00,%zmm27,%zmm4,%zmm10
+ vpclmulqdq $0x00,%zmm28,%zmm5,%zmm24
+ vpclmulqdq $0x00,%zmm29,%zmm6,%zmm25
+
+ vaesenc %zmm16,%zmm0,%zmm0
+ vaesenc %zmm16,%zmm1,%zmm1
+ vaesenc %zmm16,%zmm2,%zmm2
+ vaesenc %zmm16,%zmm3,%zmm3
+
+ vpxord %zmm24,%zmm10,%zmm10
+ vpclmulqdq $0x00,%zmm30,%zmm7,%zmm26
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm10
+ vpclmulqdq $0x01,%zmm27,%zmm4,%zmm24
+
+ vaesenc %zmm17,%zmm0,%zmm0
+ vaesenc %zmm17,%zmm1,%zmm1
+ vaesenc %zmm17,%zmm2,%zmm2
+ vaesenc %zmm17,%zmm3,%zmm3
+
+ vpclmulqdq $0x01,%zmm28,%zmm5,%zmm25
+ vpclmulqdq $0x01,%zmm29,%zmm6,%zmm26
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm24
+ vpclmulqdq $0x01,%zmm30,%zmm7,%zmm25
+
+ vaesenc %zmm18,%zmm0,%zmm0
+ vaesenc %zmm18,%zmm1,%zmm1
+ vaesenc %zmm18,%zmm2,%zmm2
+ vaesenc %zmm18,%zmm3,%zmm3
+
+ vpclmulqdq $0x10,%zmm27,%zmm4,%zmm26
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm24
+ vpclmulqdq $0x10,%zmm28,%zmm5,%zmm25
+ vpclmulqdq $0x10,%zmm29,%zmm6,%zmm26
+
+ vaesenc %zmm19,%zmm0,%zmm0
+ vaesenc %zmm19,%zmm1,%zmm1
+ vaesenc %zmm19,%zmm2,%zmm2
+ vaesenc %zmm19,%zmm3,%zmm3
+
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm24
+ vpclmulqdq $0x01,%zmm10,%zmm31,%zmm26
+ vpclmulqdq $0x10,%zmm30,%zmm7,%zmm25
+ vpxord %zmm25,%zmm24,%zmm24
+
+ vaesenc %zmm20,%zmm0,%zmm0
+ vaesenc %zmm20,%zmm1,%zmm1
+ vaesenc %zmm20,%zmm2,%zmm2
+ vaesenc %zmm20,%zmm3,%zmm3
+
+ vpshufd $0x4e,%zmm10,%zmm10
+ vpclmulqdq $0x11,%zmm27,%zmm4,%zmm4
+ vpclmulqdq $0x11,%zmm28,%zmm5,%zmm5
+ vpclmulqdq $0x11,%zmm29,%zmm6,%zmm6
+
+ vaesenc %zmm21,%zmm0,%zmm0
+ vaesenc %zmm21,%zmm1,%zmm1
+ vaesenc %zmm21,%zmm2,%zmm2
+ vaesenc %zmm21,%zmm3,%zmm3
+
+ vpternlogd $0x96,%zmm26,%zmm10,%zmm24
+ vpclmulqdq $0x11,%zmm30,%zmm7,%zmm7
+ vpternlogd $0x96,%zmm6,%zmm5,%zmm4
+ vpclmulqdq $0x01,%zmm24,%zmm31,%zmm25
+
+ vaesenc %zmm22,%zmm0,%zmm0
+ vaesenc %zmm22,%zmm1,%zmm1
+ vaesenc %zmm22,%zmm2,%zmm2
+ vaesenc %zmm22,%zmm3,%zmm3
+
+ vpxord %zmm7,%zmm4,%zmm10
+ vpshufd $0x4e,%zmm24,%zmm24
+ vpternlogd $0x96,%zmm25,%zmm24,%zmm10
+
+ vaesenc %zmm23,%zmm0,%zmm0
+ vaesenc %zmm23,%zmm1,%zmm1
+ vaesenc %zmm23,%zmm2,%zmm2
+ vaesenc %zmm23,%zmm3,%zmm3
+
+ vextracti32x4 $1,%zmm10,%xmm4
+ vextracti32x4 $2,%zmm10,%xmm5
+ vextracti32x4 $3,%zmm10,%xmm6
+ vpxord %xmm4,%xmm10,%xmm10
+ vpternlogd $0x96,%xmm5,%xmm6,%xmm10
+
+
+
+
+ vpxord 0(%rdi),%zmm14,%zmm4
+ vpxord 64(%rdi),%zmm14,%zmm5
+ vpxord 128(%rdi),%zmm14,%zmm6
+ vpxord 192(%rdi),%zmm14,%zmm7
+
+
+
+ vaesenclast %zmm4,%zmm0,%zmm4
+ vaesenclast %zmm5,%zmm1,%zmm5
+ vaesenclast %zmm6,%zmm2,%zmm6
+ vaesenclast %zmm7,%zmm3,%zmm7
+
+
+ vmovdqu8 %zmm4,0(%rsi)
+ vmovdqu8 %zmm5,64(%rsi)
+ vmovdqu8 %zmm6,128(%rsi)
+ vmovdqu8 %zmm7,192(%rsi)
+
+ subq $-256,%rdi
+ subq $-256,%rsi
+ addq $-256,%rdx
+ cmpq $256-1,%rdx
+ ja L$crypt_loop_4x__func3
+L$ghash_last_ciphertext_4x__func3:
+ vpshufb %zmm8,%zmm4,%zmm4
+ vpxord %zmm10,%zmm4,%zmm4
+ vpshufb %zmm8,%zmm5,%zmm5
+ vpshufb %zmm8,%zmm6,%zmm6
+ vpshufb %zmm8,%zmm7,%zmm7
+ vpclmulqdq $0x00,%zmm27,%zmm4,%zmm10
+ vpclmulqdq $0x00,%zmm28,%zmm5,%zmm24
+ vpclmulqdq $0x00,%zmm29,%zmm6,%zmm25
+ vpxord %zmm24,%zmm10,%zmm10
+ vpclmulqdq $0x00,%zmm30,%zmm7,%zmm26
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm10
+ vpclmulqdq $0x01,%zmm27,%zmm4,%zmm24
+ vpclmulqdq $0x01,%zmm28,%zmm5,%zmm25
+ vpclmulqdq $0x01,%zmm29,%zmm6,%zmm26
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm24
+ vpclmulqdq $0x01,%zmm30,%zmm7,%zmm25
+ vpclmulqdq $0x10,%zmm27,%zmm4,%zmm26
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm24
+ vpclmulqdq $0x10,%zmm28,%zmm5,%zmm25
+ vpclmulqdq $0x10,%zmm29,%zmm6,%zmm26
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm24
+ vpclmulqdq $0x01,%zmm10,%zmm31,%zmm26
+ vpclmulqdq $0x10,%zmm30,%zmm7,%zmm25
+ vpxord %zmm25,%zmm24,%zmm24
+ vpshufd $0x4e,%zmm10,%zmm10
+ vpclmulqdq $0x11,%zmm27,%zmm4,%zmm4
+ vpclmulqdq $0x11,%zmm28,%zmm5,%zmm5
+ vpclmulqdq $0x11,%zmm29,%zmm6,%zmm6
+ vpternlogd $0x96,%zmm26,%zmm10,%zmm24
+ vpclmulqdq $0x11,%zmm30,%zmm7,%zmm7
+ vpternlogd $0x96,%zmm6,%zmm5,%zmm4
+ vpclmulqdq $0x01,%zmm24,%zmm31,%zmm25
+ vpxord %zmm7,%zmm4,%zmm10
+ vpshufd $0x4e,%zmm24,%zmm24
+ vpternlogd $0x96,%zmm25,%zmm24,%zmm10
+ vextracti32x4 $1,%zmm10,%xmm4
+ vextracti32x4 $2,%zmm10,%xmm5
+ vextracti32x4 $3,%zmm10,%xmm6
+ vpxord %xmm4,%xmm10,%xmm10
+ vpternlogd $0x96,%xmm5,%xmm6,%xmm10
+
+L$crypt_loop_4x_done__func3:
+
+ testq %rdx,%rdx
+ jz L$done__func3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ movq %rdx,%rax
+ negq %rax
+ andq $-16,%rax
+ leaq 256(%r9,%rax,1),%r8
+ vpxor %xmm4,%xmm4,%xmm4
+ vpxor %xmm5,%xmm5,%xmm5
+ vpxor %xmm6,%xmm6,%xmm6
+
+ cmpq $64,%rdx
+ jb L$partial_vec__func3
+
+L$crypt_loop_1x__func3:
+
+
+
+ vpshufb %zmm8,%zmm12,%zmm0
+ vpaddd %zmm11,%zmm12,%zmm12
+ vpxord %zmm13,%zmm0,%zmm0
+ leaq 16(%rcx),%rax
+L$vaesenc_loop_tail_full_vec__func3:
+ vbroadcasti32x4 (%rax),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne L$vaesenc_loop_tail_full_vec__func3
+ vaesenclast %zmm14,%zmm0,%zmm0
+
+
+ vmovdqu8 (%rdi),%zmm1
+ vpxord %zmm1,%zmm0,%zmm0
+ vmovdqu8 %zmm0,(%rsi)
+
+
+ vmovdqu8 (%r8),%zmm30
+ vpshufb %zmm8,%zmm0,%zmm0
+ vpxord %zmm10,%zmm0,%zmm0
+ vpclmulqdq $0x00,%zmm30,%zmm0,%zmm7
+ vpclmulqdq $0x01,%zmm30,%zmm0,%zmm1
+ vpclmulqdq $0x10,%zmm30,%zmm0,%zmm2
+ vpclmulqdq $0x11,%zmm30,%zmm0,%zmm3
+ vpxord %zmm7,%zmm4,%zmm4
+ vpternlogd $0x96,%zmm2,%zmm1,%zmm5
+ vpxord %zmm3,%zmm6,%zmm6
+
+ vpxor %xmm10,%xmm10,%xmm10
+
+ addq $64,%r8
+ addq $64,%rdi
+ addq $64,%rsi
+ subq $64,%rdx
+ cmpq $64,%rdx
+ jae L$crypt_loop_1x__func3
+
+ testq %rdx,%rdx
+ jz L$reduce__func3
+
+L$partial_vec__func3:
+
+
+
+
+ movq $-1,%rax
+ bzhiq %rdx,%rax,%rax
+ kmovq %rax,%k1
+ addq $15,%rdx
+ andq $-16,%rdx
+ movq $-1,%rax
+ bzhiq %rdx,%rax,%rax
+ kmovq %rax,%k2
+
+
+
+ vpshufb %zmm8,%zmm12,%zmm0
+ vpxord %zmm13,%zmm0,%zmm0
+ leaq 16(%rcx),%rax
+L$vaesenc_loop_tail_partialvec__func3:
+ vbroadcasti32x4 (%rax),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne L$vaesenc_loop_tail_partialvec__func3
+ vaesenclast %zmm14,%zmm0,%zmm0
+
+
+ vmovdqu8 (%rdi),%zmm1{%k1}{z}
+ vpxord %zmm1,%zmm0,%zmm0
+ vmovdqu8 %zmm0,(%rsi){%k1}
+
+
+
+
+
+
+
+
+
+
+
+
+
+ vmovdqu8 (%r8),%zmm30{%k2}{z}
+ vmovdqu8 %zmm0,%zmm1{%k1}{z}
+ vpshufb %zmm8,%zmm1,%zmm0
+ vpxord %zmm10,%zmm0,%zmm0
+ vpclmulqdq $0x00,%zmm30,%zmm0,%zmm7
+ vpclmulqdq $0x01,%zmm30,%zmm0,%zmm1
+ vpclmulqdq $0x10,%zmm30,%zmm0,%zmm2
+ vpclmulqdq $0x11,%zmm30,%zmm0,%zmm3
+ vpxord %zmm7,%zmm4,%zmm4
+ vpternlogd $0x96,%zmm2,%zmm1,%zmm5
+ vpxord %zmm3,%zmm6,%zmm6
+
+
+L$reduce__func3:
+
+ vpclmulqdq $0x01,%zmm4,%zmm31,%zmm0
+ vpshufd $0x4e,%zmm4,%zmm4
+ vpternlogd $0x96,%zmm0,%zmm4,%zmm5
+ vpclmulqdq $0x01,%zmm5,%zmm31,%zmm0
+ vpshufd $0x4e,%zmm5,%zmm5
+ vpternlogd $0x96,%zmm0,%zmm5,%zmm6
+
+ vextracti32x4 $1,%zmm6,%xmm0
+ vextracti32x4 $2,%zmm6,%xmm1
+ vextracti32x4 $3,%zmm6,%xmm2
+ vpxord %xmm0,%xmm6,%xmm10
+ vpternlogd $0x96,%xmm1,%xmm2,%xmm10
+
+
+L$done__func3:
+
+ vpshufb %xmm8,%xmm10,%xmm10
+ vmovdqu %xmm10,(%r12)
+
+ vzeroupper
+ popq %r12
+
+ ret
+
+
+
+.globl _aes_gcm_dec_update_vaes_avx10_512
+.private_extern _aes_gcm_dec_update_vaes_avx10_512
+
+.p2align 5
+_aes_gcm_dec_update_vaes_avx10_512:
+
+
+_CET_ENDBR
+ pushq %r12
+
+
+ movq 16(%rsp),%r12
+
+ vbroadcasti32x4 L$bswap_mask(%rip),%zmm8
+ vbroadcasti32x4 L$gfpoly(%rip),%zmm31
+
+
+
+ vmovdqu (%r12),%xmm10
+ vpshufb %xmm8,%xmm10,%xmm10
+ vbroadcasti32x4 (%r8),%zmm12
+ vpshufb %zmm8,%zmm12,%zmm12
+
+
+
+ movl 240(%rcx),%r10d
+ leal -20(,%r10,4),%r10d
+
+
+
+
+ leaq 96(%rcx,%r10,4),%r11
+ vbroadcasti32x4 (%rcx),%zmm13
+ vbroadcasti32x4 (%r11),%zmm14
+
+
+ vpaddd L$ctr_pattern(%rip),%zmm12,%zmm12
+
+
+ vbroadcasti32x4 L$inc_4blocks(%rip),%zmm11
+
+
+
+ cmpq $256-1,%rdx
+ jbe L$crypt_loop_4x_done__func4
+
+
+ vmovdqu8 256-256(%r9),%zmm27
+ vmovdqu8 256-192(%r9),%zmm28
+ vmovdqu8 256-128(%r9),%zmm29
+ vmovdqu8 256-64(%r9),%zmm30
+ vbroadcasti32x4 -144(%r11),%zmm15
+ vbroadcasti32x4 -128(%r11),%zmm16
+ vbroadcasti32x4 -112(%r11),%zmm17
+ vbroadcasti32x4 -96(%r11),%zmm18
+ vbroadcasti32x4 -80(%r11),%zmm19
+ vbroadcasti32x4 -64(%r11),%zmm20
+ vbroadcasti32x4 -48(%r11),%zmm21
+ vbroadcasti32x4 -32(%r11),%zmm22
+ vbroadcasti32x4 -16(%r11),%zmm23
+L$crypt_loop_4x__func4:
+ vmovdqu8 0(%rdi),%zmm4
+ vmovdqu8 64(%rdi),%zmm5
+ vmovdqu8 128(%rdi),%zmm6
+ vmovdqu8 192(%rdi),%zmm7
+
+
+
+ vpshufb %zmm8,%zmm12,%zmm0
+ vpaddd %zmm11,%zmm12,%zmm12
+ vpshufb %zmm8,%zmm12,%zmm1
+ vpaddd %zmm11,%zmm12,%zmm12
+ vpshufb %zmm8,%zmm12,%zmm2
+ vpaddd %zmm11,%zmm12,%zmm12
+ vpshufb %zmm8,%zmm12,%zmm3
+ vpaddd %zmm11,%zmm12,%zmm12
+
+
+ vpxord %zmm13,%zmm0,%zmm0
+ vpxord %zmm13,%zmm1,%zmm1
+ vpxord %zmm13,%zmm2,%zmm2
+ vpxord %zmm13,%zmm3,%zmm3
+
+ cmpl $24,%r10d
+ jl L$aes128__func4
+ je L$aes192__func4
+
+ vbroadcasti32x4 -208(%r11),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ vaesenc %zmm9,%zmm1,%zmm1
+ vaesenc %zmm9,%zmm2,%zmm2
+ vaesenc %zmm9,%zmm3,%zmm3
+
+ vbroadcasti32x4 -192(%r11),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ vaesenc %zmm9,%zmm1,%zmm1
+ vaesenc %zmm9,%zmm2,%zmm2
+ vaesenc %zmm9,%zmm3,%zmm3
+
+L$aes192__func4:
+ vbroadcasti32x4 -176(%r11),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ vaesenc %zmm9,%zmm1,%zmm1
+ vaesenc %zmm9,%zmm2,%zmm2
+ vaesenc %zmm9,%zmm3,%zmm3
+
+ vbroadcasti32x4 -160(%r11),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ vaesenc %zmm9,%zmm1,%zmm1
+ vaesenc %zmm9,%zmm2,%zmm2
+ vaesenc %zmm9,%zmm3,%zmm3
+
+L$aes128__func4:
+ vpshufb %zmm8,%zmm4,%zmm4
+ vpxord %zmm10,%zmm4,%zmm4
+ vpshufb %zmm8,%zmm5,%zmm5
+ vpshufb %zmm8,%zmm6,%zmm6
+
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm1,%zmm1
+ vaesenc %zmm15,%zmm2,%zmm2
+ vaesenc %zmm15,%zmm3,%zmm3
+
+ vpshufb %zmm8,%zmm7,%zmm7
+ vpclmulqdq $0x00,%zmm27,%zmm4,%zmm10
+ vpclmulqdq $0x00,%zmm28,%zmm5,%zmm24
+ vpclmulqdq $0x00,%zmm29,%zmm6,%zmm25
+
+ vaesenc %zmm16,%zmm0,%zmm0
+ vaesenc %zmm16,%zmm1,%zmm1
+ vaesenc %zmm16,%zmm2,%zmm2
+ vaesenc %zmm16,%zmm3,%zmm3
+
+ vpxord %zmm24,%zmm10,%zmm10
+ vpclmulqdq $0x00,%zmm30,%zmm7,%zmm26
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm10
+ vpclmulqdq $0x01,%zmm27,%zmm4,%zmm24
+
+ vaesenc %zmm17,%zmm0,%zmm0
+ vaesenc %zmm17,%zmm1,%zmm1
+ vaesenc %zmm17,%zmm2,%zmm2
+ vaesenc %zmm17,%zmm3,%zmm3
+
+ vpclmulqdq $0x01,%zmm28,%zmm5,%zmm25
+ vpclmulqdq $0x01,%zmm29,%zmm6,%zmm26
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm24
+ vpclmulqdq $0x01,%zmm30,%zmm7,%zmm25
+
+ vaesenc %zmm18,%zmm0,%zmm0
+ vaesenc %zmm18,%zmm1,%zmm1
+ vaesenc %zmm18,%zmm2,%zmm2
+ vaesenc %zmm18,%zmm3,%zmm3
+
+ vpclmulqdq $0x10,%zmm27,%zmm4,%zmm26
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm24
+ vpclmulqdq $0x10,%zmm28,%zmm5,%zmm25
+ vpclmulqdq $0x10,%zmm29,%zmm6,%zmm26
+
+ vaesenc %zmm19,%zmm0,%zmm0
+ vaesenc %zmm19,%zmm1,%zmm1
+ vaesenc %zmm19,%zmm2,%zmm2
+ vaesenc %zmm19,%zmm3,%zmm3
+
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm24
+ vpclmulqdq $0x01,%zmm10,%zmm31,%zmm26
+ vpclmulqdq $0x10,%zmm30,%zmm7,%zmm25
+ vpxord %zmm25,%zmm24,%zmm24
+
+ vaesenc %zmm20,%zmm0,%zmm0
+ vaesenc %zmm20,%zmm1,%zmm1
+ vaesenc %zmm20,%zmm2,%zmm2
+ vaesenc %zmm20,%zmm3,%zmm3
+
+ vpshufd $0x4e,%zmm10,%zmm10
+ vpclmulqdq $0x11,%zmm27,%zmm4,%zmm4
+ vpclmulqdq $0x11,%zmm28,%zmm5,%zmm5
+ vpclmulqdq $0x11,%zmm29,%zmm6,%zmm6
+
+ vaesenc %zmm21,%zmm0,%zmm0
+ vaesenc %zmm21,%zmm1,%zmm1
+ vaesenc %zmm21,%zmm2,%zmm2
+ vaesenc %zmm21,%zmm3,%zmm3
+
+ vpternlogd $0x96,%zmm26,%zmm10,%zmm24
+ vpclmulqdq $0x11,%zmm30,%zmm7,%zmm7
+ vpternlogd $0x96,%zmm6,%zmm5,%zmm4
+ vpclmulqdq $0x01,%zmm24,%zmm31,%zmm25
+
+ vaesenc %zmm22,%zmm0,%zmm0
+ vaesenc %zmm22,%zmm1,%zmm1
+ vaesenc %zmm22,%zmm2,%zmm2
+ vaesenc %zmm22,%zmm3,%zmm3
+
+ vpxord %zmm7,%zmm4,%zmm10
+ vpshufd $0x4e,%zmm24,%zmm24
+ vpternlogd $0x96,%zmm25,%zmm24,%zmm10
+
+ vaesenc %zmm23,%zmm0,%zmm0
+ vaesenc %zmm23,%zmm1,%zmm1
+ vaesenc %zmm23,%zmm2,%zmm2
+ vaesenc %zmm23,%zmm3,%zmm3
+
+ vextracti32x4 $1,%zmm10,%xmm4
+ vextracti32x4 $2,%zmm10,%xmm5
+ vextracti32x4 $3,%zmm10,%xmm6
+ vpxord %xmm4,%xmm10,%xmm10
+ vpternlogd $0x96,%xmm5,%xmm6,%xmm10
+
+
+
+
+ vpxord 0(%rdi),%zmm14,%zmm4
+ vpxord 64(%rdi),%zmm14,%zmm5
+ vpxord 128(%rdi),%zmm14,%zmm6
+ vpxord 192(%rdi),%zmm14,%zmm7
+
+
+
+ vaesenclast %zmm4,%zmm0,%zmm4
+ vaesenclast %zmm5,%zmm1,%zmm5
+ vaesenclast %zmm6,%zmm2,%zmm6
+ vaesenclast %zmm7,%zmm3,%zmm7
+
+
+ vmovdqu8 %zmm4,0(%rsi)
+ vmovdqu8 %zmm5,64(%rsi)
+ vmovdqu8 %zmm6,128(%rsi)
+ vmovdqu8 %zmm7,192(%rsi)
+
+ subq $-256,%rdi
+ subq $-256,%rsi
+ addq $-256,%rdx
+ cmpq $256-1,%rdx
+ ja L$crypt_loop_4x__func4
+L$crypt_loop_4x_done__func4:
+
+ testq %rdx,%rdx
+ jz L$done__func4
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ movq %rdx,%rax
+ negq %rax
+ andq $-16,%rax
+ leaq 256(%r9,%rax,1),%r8
+ vpxor %xmm4,%xmm4,%xmm4
+ vpxor %xmm5,%xmm5,%xmm5
+ vpxor %xmm6,%xmm6,%xmm6
+
+ cmpq $64,%rdx
+ jb L$partial_vec__func4
+
+L$crypt_loop_1x__func4:
+
+
+
+ vpshufb %zmm8,%zmm12,%zmm0
+ vpaddd %zmm11,%zmm12,%zmm12
+ vpxord %zmm13,%zmm0,%zmm0
+ leaq 16(%rcx),%rax
+L$vaesenc_loop_tail_full_vec__func4:
+ vbroadcasti32x4 (%rax),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne L$vaesenc_loop_tail_full_vec__func4
+ vaesenclast %zmm14,%zmm0,%zmm0
+
+
+ vmovdqu8 (%rdi),%zmm1
+ vpxord %zmm1,%zmm0,%zmm0
+ vmovdqu8 %zmm0,(%rsi)
+
+
+ vmovdqu8 (%r8),%zmm30
+ vpshufb %zmm8,%zmm1,%zmm0
+ vpxord %zmm10,%zmm0,%zmm0
+ vpclmulqdq $0x00,%zmm30,%zmm0,%zmm7
+ vpclmulqdq $0x01,%zmm30,%zmm0,%zmm1
+ vpclmulqdq $0x10,%zmm30,%zmm0,%zmm2
+ vpclmulqdq $0x11,%zmm30,%zmm0,%zmm3
+ vpxord %zmm7,%zmm4,%zmm4
+ vpternlogd $0x96,%zmm2,%zmm1,%zmm5
+ vpxord %zmm3,%zmm6,%zmm6
+
+ vpxor %xmm10,%xmm10,%xmm10
+
+ addq $64,%r8
+ addq $64,%rdi
+ addq $64,%rsi
+ subq $64,%rdx
+ cmpq $64,%rdx
+ jae L$crypt_loop_1x__func4
+
+ testq %rdx,%rdx
+ jz L$reduce__func4
+
+L$partial_vec__func4:
+
+
+
+
+ movq $-1,%rax
+ bzhiq %rdx,%rax,%rax
+ kmovq %rax,%k1
+ addq $15,%rdx
+ andq $-16,%rdx
+ movq $-1,%rax
+ bzhiq %rdx,%rax,%rax
+ kmovq %rax,%k2
+
+
+
+ vpshufb %zmm8,%zmm12,%zmm0
+ vpxord %zmm13,%zmm0,%zmm0
+ leaq 16(%rcx),%rax
+L$vaesenc_loop_tail_partialvec__func4:
+ vbroadcasti32x4 (%rax),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne L$vaesenc_loop_tail_partialvec__func4
+ vaesenclast %zmm14,%zmm0,%zmm0
+
+
+ vmovdqu8 (%rdi),%zmm1{%k1}{z}
+ vpxord %zmm1,%zmm0,%zmm0
+ vmovdqu8 %zmm0,(%rsi){%k1}
+
+
+
+
+
+
+
+
+
+
+
+
+
+ vmovdqu8 (%r8),%zmm30{%k2}{z}
+
+ vpshufb %zmm8,%zmm1,%zmm0
+ vpxord %zmm10,%zmm0,%zmm0
+ vpclmulqdq $0x00,%zmm30,%zmm0,%zmm7
+ vpclmulqdq $0x01,%zmm30,%zmm0,%zmm1
+ vpclmulqdq $0x10,%zmm30,%zmm0,%zmm2
+ vpclmulqdq $0x11,%zmm30,%zmm0,%zmm3
+ vpxord %zmm7,%zmm4,%zmm4
+ vpternlogd $0x96,%zmm2,%zmm1,%zmm5
+ vpxord %zmm3,%zmm6,%zmm6
+
+
+L$reduce__func4:
+
+ vpclmulqdq $0x01,%zmm4,%zmm31,%zmm0
+ vpshufd $0x4e,%zmm4,%zmm4
+ vpternlogd $0x96,%zmm0,%zmm4,%zmm5
+ vpclmulqdq $0x01,%zmm5,%zmm31,%zmm0
+ vpshufd $0x4e,%zmm5,%zmm5
+ vpternlogd $0x96,%zmm0,%zmm5,%zmm6
+
+ vextracti32x4 $1,%zmm6,%xmm0
+ vextracti32x4 $2,%zmm6,%xmm1
+ vextracti32x4 $3,%zmm6,%xmm2
+ vpxord %xmm0,%xmm6,%xmm10
+ vpternlogd $0x96,%xmm1,%xmm2,%xmm10
+
+
+L$done__func4:
+
+ vpshufb %xmm8,%xmm10,%xmm10
+ vmovdqu %xmm10,(%r12)
+
+ vzeroupper
+ popq %r12
+
+ ret
+
+
+
+#endif
diff --git a/gen/bcm/aes-gcm-avx10-x86_64-linux.S b/gen/bcm/aes-gcm-avx10-x86_64-linux.S
new file mode 100644
index 0000000..cf661c8
--- /dev/null
+++ b/gen/bcm/aes-gcm-avx10-x86_64-linux.S
@@ -0,0 +1,2274 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.section .rodata
+.align 64
+
+
+.Lbswap_mask:
+.quad 0x08090a0b0c0d0e0f, 0x0001020304050607
+
+
+
+
+
+
+
+
+.Lgfpoly:
+.quad 1, 0xc200000000000000
+
+
+.Lgfpoly_and_internal_carrybit:
+.quad 1, 0xc200000000000001
+
+
+
+
+
+.Lctr_pattern:
+.quad 0, 0
+.quad 1, 0
+.Linc_2blocks:
+.quad 2, 0
+.quad 3, 0
+.Linc_4blocks:
+.quad 4, 0
+
+.text
+.globl gcm_gmult_vpclmulqdq_avx10
+.hidden gcm_gmult_vpclmulqdq_avx10
+.type gcm_gmult_vpclmulqdq_avx10,@function
+.align 32
+gcm_gmult_vpclmulqdq_avx10:
+.cfi_startproc
+
+_CET_ENDBR
+
+
+
+ vmovdqu (%rdi),%xmm0
+ vmovdqu .Lbswap_mask(%rip),%xmm1
+ vmovdqu 256-16(%rsi),%xmm2
+ vmovdqu .Lgfpoly(%rip),%xmm3
+ vpshufb %xmm1,%xmm0,%xmm0
+
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm4
+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm5
+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm6
+ vpxord %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x01,%xmm4,%xmm3,%xmm6
+ vpshufd $0x4e,%xmm4,%xmm4
+ vpternlogd $0x96,%xmm6,%xmm4,%xmm5
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x01,%xmm5,%xmm3,%xmm4
+ vpshufd $0x4e,%xmm5,%xmm5
+ vpternlogd $0x96,%xmm4,%xmm5,%xmm0
+
+
+ vpshufb %xmm1,%xmm0,%xmm0
+ vmovdqu %xmm0,(%rdi)
+ ret
+
+.cfi_endproc
+.size gcm_gmult_vpclmulqdq_avx10, . - gcm_gmult_vpclmulqdq_avx10
+.globl gcm_init_vpclmulqdq_avx10
+.hidden gcm_init_vpclmulqdq_avx10
+.type gcm_init_vpclmulqdq_avx10,@function
+.align 32
+gcm_init_vpclmulqdq_avx10:
+.cfi_startproc
+
+_CET_ENDBR
+
+ leaq 256-32(%rdi),%r8
+
+
+
+ vpshufd $0x4e,(%rsi),%xmm3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ vpshufd $0xd3,%xmm3,%xmm0
+ vpsrad $31,%xmm0,%xmm0
+ vpaddq %xmm3,%xmm3,%xmm3
+
+ vpternlogd $0x78,.Lgfpoly_and_internal_carrybit(%rip),%xmm0,%xmm3
+
+
+ vbroadcasti32x4 .Lgfpoly(%rip),%ymm5
+
+
+
+
+
+
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm3,%xmm0
+ vpclmulqdq $0x01,%xmm3,%xmm3,%xmm1
+ vpclmulqdq $0x10,%xmm3,%xmm3,%xmm2
+ vpxord %xmm2,%xmm1,%xmm1
+ vpclmulqdq $0x01,%xmm0,%xmm5,%xmm2
+ vpshufd $0x4e,%xmm0,%xmm0
+ vpternlogd $0x96,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x11,%xmm3,%xmm3,%xmm4
+ vpclmulqdq $0x01,%xmm1,%xmm5,%xmm0
+ vpshufd $0x4e,%xmm1,%xmm1
+ vpternlogd $0x96,%xmm0,%xmm1,%xmm4
+
+
+
+ vinserti128 $1,%xmm3,%ymm4,%ymm3
+ vinserti128 $1,%xmm4,%ymm4,%ymm4
+
+ vmovdqu8 %ymm3,(%r8)
+
+
+
+
+
+ movl $7,%eax
+.Lprecompute_next__func1:
+ subq $32,%r8
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm0
+ vpclmulqdq $0x01,%ymm4,%ymm3,%ymm1
+ vpclmulqdq $0x10,%ymm4,%ymm3,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vpclmulqdq $0x01,%ymm0,%ymm5,%ymm2
+ vpshufd $0x4e,%ymm0,%ymm0
+ vpternlogd $0x96,%ymm2,%ymm0,%ymm1
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm3
+ vpclmulqdq $0x01,%ymm1,%ymm5,%ymm0
+ vpshufd $0x4e,%ymm1,%ymm1
+ vpternlogd $0x96,%ymm0,%ymm1,%ymm3
+
+ vmovdqu8 %ymm3,(%r8)
+ decl %eax
+ jnz .Lprecompute_next__func1
+
+ vzeroupper
+ ret
+
+.cfi_endproc
+.size gcm_init_vpclmulqdq_avx10, . - gcm_init_vpclmulqdq_avx10
+.globl gcm_ghash_vpclmulqdq_avx10_256
+.hidden gcm_ghash_vpclmulqdq_avx10_256
+.type gcm_ghash_vpclmulqdq_avx10_256,@function
+.align 32
+gcm_ghash_vpclmulqdq_avx10_256:
+.cfi_startproc
+
+_CET_ENDBR
+
+
+
+
+
+
+ vmovdqu .Lbswap_mask(%rip),%xmm4
+ vmovdqu .Lgfpoly(%rip),%xmm10
+
+
+ vmovdqu (%rdi),%xmm5
+ vpshufb %xmm4,%xmm5,%xmm5
+
+
+ cmpq $32,%rcx
+ jb .Laad_blockbyblock__func1
+
+
+
+ vshufi64x2 $0,%ymm4,%ymm4,%ymm4
+ vshufi64x2 $0,%ymm10,%ymm10,%ymm10
+
+
+ vmovdqu8 256-32(%rsi),%ymm9
+
+ cmpq $128-1,%rcx
+ jbe .Laad_loop_1x__func1
+
+
+ vmovdqu8 256-128(%rsi),%ymm6
+ vmovdqu8 256-96(%rsi),%ymm7
+ vmovdqu8 256-64(%rsi),%ymm8
+
+
+.Laad_loop_4x__func1:
+ vmovdqu8 0(%rdx),%ymm0
+ vmovdqu8 32(%rdx),%ymm1
+ vmovdqu8 64(%rdx),%ymm2
+ vmovdqu8 96(%rdx),%ymm3
+ vpshufb %ymm4,%ymm0,%ymm0
+ vpxord %ymm5,%ymm0,%ymm0
+ vpshufb %ymm4,%ymm1,%ymm1
+ vpshufb %ymm4,%ymm2,%ymm2
+ vpshufb %ymm4,%ymm3,%ymm3
+ vpclmulqdq $0x00,%ymm6,%ymm0,%ymm5
+ vpclmulqdq $0x00,%ymm7,%ymm1,%ymm11
+ vpclmulqdq $0x00,%ymm8,%ymm2,%ymm12
+ vpxord %ymm11,%ymm5,%ymm5
+ vpclmulqdq $0x00,%ymm9,%ymm3,%ymm13
+ vpternlogd $0x96,%ymm13,%ymm12,%ymm5
+ vpclmulqdq $0x01,%ymm6,%ymm0,%ymm11
+ vpclmulqdq $0x01,%ymm7,%ymm1,%ymm12
+ vpclmulqdq $0x01,%ymm8,%ymm2,%ymm13
+ vpternlogd $0x96,%ymm13,%ymm12,%ymm11
+ vpclmulqdq $0x01,%ymm9,%ymm3,%ymm12
+ vpclmulqdq $0x10,%ymm6,%ymm0,%ymm13
+ vpternlogd $0x96,%ymm13,%ymm12,%ymm11
+ vpclmulqdq $0x10,%ymm7,%ymm1,%ymm12
+ vpclmulqdq $0x10,%ymm8,%ymm2,%ymm13
+ vpternlogd $0x96,%ymm13,%ymm12,%ymm11
+ vpclmulqdq $0x01,%ymm5,%ymm10,%ymm13
+ vpclmulqdq $0x10,%ymm9,%ymm3,%ymm12
+ vpxord %ymm12,%ymm11,%ymm11
+ vpshufd $0x4e,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm6,%ymm0,%ymm0
+ vpclmulqdq $0x11,%ymm7,%ymm1,%ymm1
+ vpclmulqdq $0x11,%ymm8,%ymm2,%ymm2
+ vpternlogd $0x96,%ymm13,%ymm5,%ymm11
+ vpclmulqdq $0x11,%ymm9,%ymm3,%ymm3
+ vpternlogd $0x96,%ymm2,%ymm1,%ymm0
+ vpclmulqdq $0x01,%ymm11,%ymm10,%ymm12
+ vpxord %ymm3,%ymm0,%ymm5
+ vpshufd $0x4e,%ymm11,%ymm11
+ vpternlogd $0x96,%ymm12,%ymm11,%ymm5
+ vextracti32x4 $1,%ymm5,%xmm0
+ vpxord %xmm0,%xmm5,%xmm5
+
+ subq $-128,%rdx
+ addq $-128,%rcx
+ cmpq $128-1,%rcx
+ ja .Laad_loop_4x__func1
+
+
+ cmpq $32,%rcx
+ jb .Laad_large_done__func1
+.Laad_loop_1x__func1:
+ vmovdqu8 (%rdx),%ymm0
+ vpshufb %ymm4,%ymm0,%ymm0
+ vpxord %ymm0,%ymm5,%ymm5
+ vpclmulqdq $0x00,%ymm9,%ymm5,%ymm0
+ vpclmulqdq $0x01,%ymm9,%ymm5,%ymm1
+ vpclmulqdq $0x10,%ymm9,%ymm5,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vpclmulqdq $0x01,%ymm0,%ymm10,%ymm2
+ vpshufd $0x4e,%ymm0,%ymm0
+ vpternlogd $0x96,%ymm2,%ymm0,%ymm1
+ vpclmulqdq $0x11,%ymm9,%ymm5,%ymm5
+ vpclmulqdq $0x01,%ymm1,%ymm10,%ymm0
+ vpshufd $0x4e,%ymm1,%ymm1
+ vpternlogd $0x96,%ymm0,%ymm1,%ymm5
+
+ vextracti32x4 $1,%ymm5,%xmm0
+ vpxord %xmm0,%xmm5,%xmm5
+
+ addq $32,%rdx
+ subq $32,%rcx
+ cmpq $32,%rcx
+ jae .Laad_loop_1x__func1
+
+.Laad_large_done__func1:
+
+
+ vzeroupper
+
+
+.Laad_blockbyblock__func1:
+ testq %rcx,%rcx
+ jz .Laad_done__func1
+ vmovdqu 256-16(%rsi),%xmm9
+.Laad_loop_blockbyblock__func1:
+ vmovdqu (%rdx),%xmm0
+ vpshufb %xmm4,%xmm0,%xmm0
+ vpxor %xmm0,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm9,%xmm5,%xmm0
+ vpclmulqdq $0x01,%xmm9,%xmm5,%xmm1
+ vpclmulqdq $0x10,%xmm9,%xmm5,%xmm2
+ vpxord %xmm2,%xmm1,%xmm1
+ vpclmulqdq $0x01,%xmm0,%xmm10,%xmm2
+ vpshufd $0x4e,%xmm0,%xmm0
+ vpternlogd $0x96,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x11,%xmm9,%xmm5,%xmm5
+ vpclmulqdq $0x01,%xmm1,%xmm10,%xmm0
+ vpshufd $0x4e,%xmm1,%xmm1
+ vpternlogd $0x96,%xmm0,%xmm1,%xmm5
+
+ addq $16,%rdx
+ subq $16,%rcx
+ jnz .Laad_loop_blockbyblock__func1
+
+.Laad_done__func1:
+
+ vpshufb %xmm4,%xmm5,%xmm5
+ vmovdqu %xmm5,(%rdi)
+ ret
+
+.cfi_endproc
+.size gcm_ghash_vpclmulqdq_avx10_256, . - gcm_ghash_vpclmulqdq_avx10_256
+.globl aes_gcm_enc_update_vaes_avx10_256
+.hidden aes_gcm_enc_update_vaes_avx10_256
+.type aes_gcm_enc_update_vaes_avx10_256,@function
+.align 32
+aes_gcm_enc_update_vaes_avx10_256:
+.cfi_startproc
+
+_CET_ENDBR
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-16
+
+ movq 16(%rsp),%r12
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+ movb $1,BORINGSSL_function_hit+6(%rip)
+#endif
+
+ vbroadcasti32x4 .Lbswap_mask(%rip),%ymm8
+ vbroadcasti32x4 .Lgfpoly(%rip),%ymm31
+
+
+
+ vmovdqu (%r12),%xmm10
+ vpshufb %xmm8,%xmm10,%xmm10
+ vbroadcasti32x4 (%r8),%ymm12
+ vpshufb %ymm8,%ymm12,%ymm12
+
+
+
+ movl 240(%rcx),%r10d
+ leal -20(,%r10,4),%r10d
+
+
+
+
+ leaq 96(%rcx,%r10,4),%r11
+ vbroadcasti32x4 (%rcx),%ymm13
+ vbroadcasti32x4 (%r11),%ymm14
+
+
+ vpaddd .Lctr_pattern(%rip),%ymm12,%ymm12
+
+
+ vbroadcasti32x4 .Linc_2blocks(%rip),%ymm11
+
+
+
+ cmpq $128-1,%rdx
+ jbe .Lcrypt_loop_4x_done__func1
+
+
+ vmovdqu8 256-128(%r9),%ymm27
+ vmovdqu8 256-96(%r9),%ymm28
+ vmovdqu8 256-64(%r9),%ymm29
+ vmovdqu8 256-32(%r9),%ymm30
+
+
+
+
+ vpshufb %ymm8,%ymm12,%ymm0
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm12,%ymm1
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm12,%ymm2
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm12,%ymm3
+ vpaddd %ymm11,%ymm12,%ymm12
+
+
+ vpxord %ymm13,%ymm0,%ymm0
+ vpxord %ymm13,%ymm1,%ymm1
+ vpxord %ymm13,%ymm2,%ymm2
+ vpxord %ymm13,%ymm3,%ymm3
+
+ leaq 16(%rcx),%rax
+.Lvaesenc_loop_first_4_vecs__func1:
+ vbroadcasti32x4 (%rax),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ vaesenc %ymm9,%ymm1,%ymm1
+ vaesenc %ymm9,%ymm2,%ymm2
+ vaesenc %ymm9,%ymm3,%ymm3
+
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne .Lvaesenc_loop_first_4_vecs__func1
+
+
+
+ vpxord 0(%rdi),%ymm14,%ymm4
+ vpxord 32(%rdi),%ymm14,%ymm5
+ vpxord 64(%rdi),%ymm14,%ymm6
+ vpxord 96(%rdi),%ymm14,%ymm7
+
+
+
+ vaesenclast %ymm4,%ymm0,%ymm4
+ vaesenclast %ymm5,%ymm1,%ymm5
+ vaesenclast %ymm6,%ymm2,%ymm6
+ vaesenclast %ymm7,%ymm3,%ymm7
+
+
+ vmovdqu8 %ymm4,0(%rsi)
+ vmovdqu8 %ymm5,32(%rsi)
+ vmovdqu8 %ymm6,64(%rsi)
+ vmovdqu8 %ymm7,96(%rsi)
+
+ subq $-128,%rdi
+ subq $-128,%rsi
+ addq $-128,%rdx
+ cmpq $128-1,%rdx
+ jbe .Lghash_last_ciphertext_4x__func1
+ vbroadcasti32x4 -144(%r11),%ymm15
+ vbroadcasti32x4 -128(%r11),%ymm16
+ vbroadcasti32x4 -112(%r11),%ymm17
+ vbroadcasti32x4 -96(%r11),%ymm18
+ vbroadcasti32x4 -80(%r11),%ymm19
+ vbroadcasti32x4 -64(%r11),%ymm20
+ vbroadcasti32x4 -48(%r11),%ymm21
+ vbroadcasti32x4 -32(%r11),%ymm22
+ vbroadcasti32x4 -16(%r11),%ymm23
+.Lcrypt_loop_4x__func1:
+
+
+
+ vpshufb %ymm8,%ymm12,%ymm0
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm12,%ymm1
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm12,%ymm2
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm12,%ymm3
+ vpaddd %ymm11,%ymm12,%ymm12
+
+
+ vpxord %ymm13,%ymm0,%ymm0
+ vpxord %ymm13,%ymm1,%ymm1
+ vpxord %ymm13,%ymm2,%ymm2
+ vpxord %ymm13,%ymm3,%ymm3
+
+ cmpl $24,%r10d
+ jl .Laes128__func1
+ je .Laes192__func1
+
+ vbroadcasti32x4 -208(%r11),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ vaesenc %ymm9,%ymm1,%ymm1
+ vaesenc %ymm9,%ymm2,%ymm2
+ vaesenc %ymm9,%ymm3,%ymm3
+
+ vbroadcasti32x4 -192(%r11),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ vaesenc %ymm9,%ymm1,%ymm1
+ vaesenc %ymm9,%ymm2,%ymm2
+ vaesenc %ymm9,%ymm3,%ymm3
+
+.Laes192__func1:
+ vbroadcasti32x4 -176(%r11),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ vaesenc %ymm9,%ymm1,%ymm1
+ vaesenc %ymm9,%ymm2,%ymm2
+ vaesenc %ymm9,%ymm3,%ymm3
+
+ vbroadcasti32x4 -160(%r11),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ vaesenc %ymm9,%ymm1,%ymm1
+ vaesenc %ymm9,%ymm2,%ymm2
+ vaesenc %ymm9,%ymm3,%ymm3
+
+.Laes128__func1:
+ vpshufb %ymm8,%ymm4,%ymm4
+ vpxord %ymm10,%ymm4,%ymm4
+ vpshufb %ymm8,%ymm5,%ymm5
+ vpshufb %ymm8,%ymm6,%ymm6
+
+ vaesenc %ymm15,%ymm0,%ymm0
+ vaesenc %ymm15,%ymm1,%ymm1
+ vaesenc %ymm15,%ymm2,%ymm2
+ vaesenc %ymm15,%ymm3,%ymm3
+
+ vpshufb %ymm8,%ymm7,%ymm7
+ vpclmulqdq $0x00,%ymm27,%ymm4,%ymm10
+ vpclmulqdq $0x00,%ymm28,%ymm5,%ymm24
+ vpclmulqdq $0x00,%ymm29,%ymm6,%ymm25
+
+ vaesenc %ymm16,%ymm0,%ymm0
+ vaesenc %ymm16,%ymm1,%ymm1
+ vaesenc %ymm16,%ymm2,%ymm2
+ vaesenc %ymm16,%ymm3,%ymm3
+
+ vpxord %ymm24,%ymm10,%ymm10
+ vpclmulqdq $0x00,%ymm30,%ymm7,%ymm26
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm10
+ vpclmulqdq $0x01,%ymm27,%ymm4,%ymm24
+
+ vaesenc %ymm17,%ymm0,%ymm0
+ vaesenc %ymm17,%ymm1,%ymm1
+ vaesenc %ymm17,%ymm2,%ymm2
+ vaesenc %ymm17,%ymm3,%ymm3
+
+ vpclmulqdq $0x01,%ymm28,%ymm5,%ymm25
+ vpclmulqdq $0x01,%ymm29,%ymm6,%ymm26
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm24
+ vpclmulqdq $0x01,%ymm30,%ymm7,%ymm25
+
+ vaesenc %ymm18,%ymm0,%ymm0
+ vaesenc %ymm18,%ymm1,%ymm1
+ vaesenc %ymm18,%ymm2,%ymm2
+ vaesenc %ymm18,%ymm3,%ymm3
+
+ vpclmulqdq $0x10,%ymm27,%ymm4,%ymm26
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm24
+ vpclmulqdq $0x10,%ymm28,%ymm5,%ymm25
+ vpclmulqdq $0x10,%ymm29,%ymm6,%ymm26
+
+ vaesenc %ymm19,%ymm0,%ymm0
+ vaesenc %ymm19,%ymm1,%ymm1
+ vaesenc %ymm19,%ymm2,%ymm2
+ vaesenc %ymm19,%ymm3,%ymm3
+
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm24
+ vpclmulqdq $0x01,%ymm10,%ymm31,%ymm26
+ vpclmulqdq $0x10,%ymm30,%ymm7,%ymm25
+ vpxord %ymm25,%ymm24,%ymm24
+
+ vaesenc %ymm20,%ymm0,%ymm0
+ vaesenc %ymm20,%ymm1,%ymm1
+ vaesenc %ymm20,%ymm2,%ymm2
+ vaesenc %ymm20,%ymm3,%ymm3
+
+ vpshufd $0x4e,%ymm10,%ymm10
+ vpclmulqdq $0x11,%ymm27,%ymm4,%ymm4
+ vpclmulqdq $0x11,%ymm28,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm29,%ymm6,%ymm6
+
+ vaesenc %ymm21,%ymm0,%ymm0
+ vaesenc %ymm21,%ymm1,%ymm1
+ vaesenc %ymm21,%ymm2,%ymm2
+ vaesenc %ymm21,%ymm3,%ymm3
+
+ vpternlogd $0x96,%ymm26,%ymm10,%ymm24
+ vpclmulqdq $0x11,%ymm30,%ymm7,%ymm7
+ vpternlogd $0x96,%ymm6,%ymm5,%ymm4
+ vpclmulqdq $0x01,%ymm24,%ymm31,%ymm25
+
+ vaesenc %ymm22,%ymm0,%ymm0
+ vaesenc %ymm22,%ymm1,%ymm1
+ vaesenc %ymm22,%ymm2,%ymm2
+ vaesenc %ymm22,%ymm3,%ymm3
+
+ vpxord %ymm7,%ymm4,%ymm10
+ vpshufd $0x4e,%ymm24,%ymm24
+ vpternlogd $0x96,%ymm25,%ymm24,%ymm10
+
+ vaesenc %ymm23,%ymm0,%ymm0
+ vaesenc %ymm23,%ymm1,%ymm1
+ vaesenc %ymm23,%ymm2,%ymm2
+ vaesenc %ymm23,%ymm3,%ymm3
+
+ vextracti32x4 $1,%ymm10,%xmm4
+ vpxord %xmm4,%xmm10,%xmm10
+
+
+
+
+ vpxord 0(%rdi),%ymm14,%ymm4
+ vpxord 32(%rdi),%ymm14,%ymm5
+ vpxord 64(%rdi),%ymm14,%ymm6
+ vpxord 96(%rdi),%ymm14,%ymm7
+
+
+
+ vaesenclast %ymm4,%ymm0,%ymm4
+ vaesenclast %ymm5,%ymm1,%ymm5
+ vaesenclast %ymm6,%ymm2,%ymm6
+ vaesenclast %ymm7,%ymm3,%ymm7
+
+
+ vmovdqu8 %ymm4,0(%rsi)
+ vmovdqu8 %ymm5,32(%rsi)
+ vmovdqu8 %ymm6,64(%rsi)
+ vmovdqu8 %ymm7,96(%rsi)
+
+ subq $-128,%rdi
+ subq $-128,%rsi
+ addq $-128,%rdx
+ cmpq $128-1,%rdx
+ ja .Lcrypt_loop_4x__func1
+.Lghash_last_ciphertext_4x__func1:
+ vpshufb %ymm8,%ymm4,%ymm4
+ vpxord %ymm10,%ymm4,%ymm4
+ vpshufb %ymm8,%ymm5,%ymm5
+ vpshufb %ymm8,%ymm6,%ymm6
+ vpshufb %ymm8,%ymm7,%ymm7
+ vpclmulqdq $0x00,%ymm27,%ymm4,%ymm10
+ vpclmulqdq $0x00,%ymm28,%ymm5,%ymm24
+ vpclmulqdq $0x00,%ymm29,%ymm6,%ymm25
+ vpxord %ymm24,%ymm10,%ymm10
+ vpclmulqdq $0x00,%ymm30,%ymm7,%ymm26
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm10
+ vpclmulqdq $0x01,%ymm27,%ymm4,%ymm24
+ vpclmulqdq $0x01,%ymm28,%ymm5,%ymm25
+ vpclmulqdq $0x01,%ymm29,%ymm6,%ymm26
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm24
+ vpclmulqdq $0x01,%ymm30,%ymm7,%ymm25
+ vpclmulqdq $0x10,%ymm27,%ymm4,%ymm26
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm24
+ vpclmulqdq $0x10,%ymm28,%ymm5,%ymm25
+ vpclmulqdq $0x10,%ymm29,%ymm6,%ymm26
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm24
+ vpclmulqdq $0x01,%ymm10,%ymm31,%ymm26
+ vpclmulqdq $0x10,%ymm30,%ymm7,%ymm25
+ vpxord %ymm25,%ymm24,%ymm24
+ vpshufd $0x4e,%ymm10,%ymm10
+ vpclmulqdq $0x11,%ymm27,%ymm4,%ymm4
+ vpclmulqdq $0x11,%ymm28,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm29,%ymm6,%ymm6
+ vpternlogd $0x96,%ymm26,%ymm10,%ymm24
+ vpclmulqdq $0x11,%ymm30,%ymm7,%ymm7
+ vpternlogd $0x96,%ymm6,%ymm5,%ymm4
+ vpclmulqdq $0x01,%ymm24,%ymm31,%ymm25
+ vpxord %ymm7,%ymm4,%ymm10
+ vpshufd $0x4e,%ymm24,%ymm24
+ vpternlogd $0x96,%ymm25,%ymm24,%ymm10
+ vextracti32x4 $1,%ymm10,%xmm4
+ vpxord %xmm4,%xmm10,%xmm10
+
+.Lcrypt_loop_4x_done__func1:
+
+ testq %rdx,%rdx
+ jz .Ldone__func1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ movq %rdx,%rax
+ negq %rax
+ andq $-16,%rax
+ leaq 256(%r9,%rax,1),%r8
+ vpxor %xmm4,%xmm4,%xmm4
+ vpxor %xmm5,%xmm5,%xmm5
+ vpxor %xmm6,%xmm6,%xmm6
+
+ cmpq $32,%rdx
+ jb .Lpartial_vec__func1
+
+.Lcrypt_loop_1x__func1:
+
+
+
+ vpshufb %ymm8,%ymm12,%ymm0
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpxord %ymm13,%ymm0,%ymm0
+ leaq 16(%rcx),%rax
+.Lvaesenc_loop_tail_full_vec__func1:
+ vbroadcasti32x4 (%rax),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne .Lvaesenc_loop_tail_full_vec__func1
+ vaesenclast %ymm14,%ymm0,%ymm0
+
+
+ vmovdqu8 (%rdi),%ymm1
+ vpxord %ymm1,%ymm0,%ymm0
+ vmovdqu8 %ymm0,(%rsi)
+
+
+ vmovdqu8 (%r8),%ymm30
+ vpshufb %ymm8,%ymm0,%ymm0
+ vpxord %ymm10,%ymm0,%ymm0
+ vpclmulqdq $0x00,%ymm30,%ymm0,%ymm7
+ vpclmulqdq $0x01,%ymm30,%ymm0,%ymm1
+ vpclmulqdq $0x10,%ymm30,%ymm0,%ymm2
+ vpclmulqdq $0x11,%ymm30,%ymm0,%ymm3
+ vpxord %ymm7,%ymm4,%ymm4
+ vpternlogd $0x96,%ymm2,%ymm1,%ymm5
+ vpxord %ymm3,%ymm6,%ymm6
+
+ vpxor %xmm10,%xmm10,%xmm10
+
+ addq $32,%r8
+ addq $32,%rdi
+ addq $32,%rsi
+ subq $32,%rdx
+ cmpq $32,%rdx
+ jae .Lcrypt_loop_1x__func1
+
+ testq %rdx,%rdx
+ jz .Lreduce__func1
+
+.Lpartial_vec__func1:
+
+
+
+
+ movq $-1,%rax
+ bzhiq %rdx,%rax,%rax
+ kmovd %eax,%k1
+ addq $15,%rdx
+ andq $-16,%rdx
+ movq $-1,%rax
+ bzhiq %rdx,%rax,%rax
+ kmovd %eax,%k2
+
+
+
+ vpshufb %ymm8,%ymm12,%ymm0
+ vpxord %ymm13,%ymm0,%ymm0
+ leaq 16(%rcx),%rax
+.Lvaesenc_loop_tail_partialvec__func1:
+ vbroadcasti32x4 (%rax),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne .Lvaesenc_loop_tail_partialvec__func1
+ vaesenclast %ymm14,%ymm0,%ymm0
+
+
+ vmovdqu8 (%rdi),%ymm1{%k1}{z}
+ vpxord %ymm1,%ymm0,%ymm0
+ vmovdqu8 %ymm0,(%rsi){%k1}
+
+
+
+
+
+
+
+
+
+
+
+
+
+ vmovdqu8 (%r8),%ymm30{%k2}{z}
+ vmovdqu8 %ymm0,%ymm1{%k1}{z}
+ vpshufb %ymm8,%ymm1,%ymm0
+ vpxord %ymm10,%ymm0,%ymm0
+ vpclmulqdq $0x00,%ymm30,%ymm0,%ymm7
+ vpclmulqdq $0x01,%ymm30,%ymm0,%ymm1
+ vpclmulqdq $0x10,%ymm30,%ymm0,%ymm2
+ vpclmulqdq $0x11,%ymm30,%ymm0,%ymm3
+ vpxord %ymm7,%ymm4,%ymm4
+ vpternlogd $0x96,%ymm2,%ymm1,%ymm5
+ vpxord %ymm3,%ymm6,%ymm6
+
+
+.Lreduce__func1:
+
+ vpclmulqdq $0x01,%ymm4,%ymm31,%ymm0
+ vpshufd $0x4e,%ymm4,%ymm4
+ vpternlogd $0x96,%ymm0,%ymm4,%ymm5
+ vpclmulqdq $0x01,%ymm5,%ymm31,%ymm0
+ vpshufd $0x4e,%ymm5,%ymm5
+ vpternlogd $0x96,%ymm0,%ymm5,%ymm6
+
+ vextracti32x4 $1,%ymm6,%xmm0
+ vpxord %xmm0,%xmm6,%xmm10
+
+
+.Ldone__func1:
+
+ vpshufb %xmm8,%xmm10,%xmm10
+ vmovdqu %xmm10,(%r12)
+
+ vzeroupper
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ ret
+
+.cfi_endproc
+.size aes_gcm_enc_update_vaes_avx10_256, . - aes_gcm_enc_update_vaes_avx10_256
+.globl aes_gcm_dec_update_vaes_avx10_256
+.hidden aes_gcm_dec_update_vaes_avx10_256
+.type aes_gcm_dec_update_vaes_avx10_256,@function
+.align 32
+aes_gcm_dec_update_vaes_avx10_256:
+.cfi_startproc
+
+_CET_ENDBR
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-16
+
+ movq 16(%rsp),%r12
+
+ vbroadcasti32x4 .Lbswap_mask(%rip),%ymm8
+ vbroadcasti32x4 .Lgfpoly(%rip),%ymm31
+
+
+
+ vmovdqu (%r12),%xmm10
+ vpshufb %xmm8,%xmm10,%xmm10
+ vbroadcasti32x4 (%r8),%ymm12
+ vpshufb %ymm8,%ymm12,%ymm12
+
+
+
+ movl 240(%rcx),%r10d
+ leal -20(,%r10,4),%r10d
+
+
+
+
+ leaq 96(%rcx,%r10,4),%r11
+ vbroadcasti32x4 (%rcx),%ymm13
+ vbroadcasti32x4 (%r11),%ymm14
+
+
+ vpaddd .Lctr_pattern(%rip),%ymm12,%ymm12
+
+
+ vbroadcasti32x4 .Linc_2blocks(%rip),%ymm11
+
+
+
+ cmpq $128-1,%rdx
+ jbe .Lcrypt_loop_4x_done__func2
+
+
+ vmovdqu8 256-128(%r9),%ymm27
+ vmovdqu8 256-96(%r9),%ymm28
+ vmovdqu8 256-64(%r9),%ymm29
+ vmovdqu8 256-32(%r9),%ymm30
+ vbroadcasti32x4 -144(%r11),%ymm15
+ vbroadcasti32x4 -128(%r11),%ymm16
+ vbroadcasti32x4 -112(%r11),%ymm17
+ vbroadcasti32x4 -96(%r11),%ymm18
+ vbroadcasti32x4 -80(%r11),%ymm19
+ vbroadcasti32x4 -64(%r11),%ymm20
+ vbroadcasti32x4 -48(%r11),%ymm21
+ vbroadcasti32x4 -32(%r11),%ymm22
+ vbroadcasti32x4 -16(%r11),%ymm23
+.Lcrypt_loop_4x__func2:
+ vmovdqu8 0(%rdi),%ymm4
+ vmovdqu8 32(%rdi),%ymm5
+ vmovdqu8 64(%rdi),%ymm6
+ vmovdqu8 96(%rdi),%ymm7
+
+
+
+ vpshufb %ymm8,%ymm12,%ymm0
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm12,%ymm1
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm12,%ymm2
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm12,%ymm3
+ vpaddd %ymm11,%ymm12,%ymm12
+
+
+ vpxord %ymm13,%ymm0,%ymm0
+ vpxord %ymm13,%ymm1,%ymm1
+ vpxord %ymm13,%ymm2,%ymm2
+ vpxord %ymm13,%ymm3,%ymm3
+
+ cmpl $24,%r10d
+ jl .Laes128__func2
+ je .Laes192__func2
+
+ vbroadcasti32x4 -208(%r11),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ vaesenc %ymm9,%ymm1,%ymm1
+ vaesenc %ymm9,%ymm2,%ymm2
+ vaesenc %ymm9,%ymm3,%ymm3
+
+ vbroadcasti32x4 -192(%r11),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ vaesenc %ymm9,%ymm1,%ymm1
+ vaesenc %ymm9,%ymm2,%ymm2
+ vaesenc %ymm9,%ymm3,%ymm3
+
+.Laes192__func2:
+ vbroadcasti32x4 -176(%r11),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ vaesenc %ymm9,%ymm1,%ymm1
+ vaesenc %ymm9,%ymm2,%ymm2
+ vaesenc %ymm9,%ymm3,%ymm3
+
+ vbroadcasti32x4 -160(%r11),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ vaesenc %ymm9,%ymm1,%ymm1
+ vaesenc %ymm9,%ymm2,%ymm2
+ vaesenc %ymm9,%ymm3,%ymm3
+
+.Laes128__func2:
+ vpshufb %ymm8,%ymm4,%ymm4
+ vpxord %ymm10,%ymm4,%ymm4
+ vpshufb %ymm8,%ymm5,%ymm5
+ vpshufb %ymm8,%ymm6,%ymm6
+
+ vaesenc %ymm15,%ymm0,%ymm0
+ vaesenc %ymm15,%ymm1,%ymm1
+ vaesenc %ymm15,%ymm2,%ymm2
+ vaesenc %ymm15,%ymm3,%ymm3
+
+ vpshufb %ymm8,%ymm7,%ymm7
+ vpclmulqdq $0x00,%ymm27,%ymm4,%ymm10
+ vpclmulqdq $0x00,%ymm28,%ymm5,%ymm24
+ vpclmulqdq $0x00,%ymm29,%ymm6,%ymm25
+
+ vaesenc %ymm16,%ymm0,%ymm0
+ vaesenc %ymm16,%ymm1,%ymm1
+ vaesenc %ymm16,%ymm2,%ymm2
+ vaesenc %ymm16,%ymm3,%ymm3
+
+ vpxord %ymm24,%ymm10,%ymm10
+ vpclmulqdq $0x00,%ymm30,%ymm7,%ymm26
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm10
+ vpclmulqdq $0x01,%ymm27,%ymm4,%ymm24
+
+ vaesenc %ymm17,%ymm0,%ymm0
+ vaesenc %ymm17,%ymm1,%ymm1
+ vaesenc %ymm17,%ymm2,%ymm2
+ vaesenc %ymm17,%ymm3,%ymm3
+
+ vpclmulqdq $0x01,%ymm28,%ymm5,%ymm25
+ vpclmulqdq $0x01,%ymm29,%ymm6,%ymm26
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm24
+ vpclmulqdq $0x01,%ymm30,%ymm7,%ymm25
+
+ vaesenc %ymm18,%ymm0,%ymm0
+ vaesenc %ymm18,%ymm1,%ymm1
+ vaesenc %ymm18,%ymm2,%ymm2
+ vaesenc %ymm18,%ymm3,%ymm3
+
+ vpclmulqdq $0x10,%ymm27,%ymm4,%ymm26
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm24
+ vpclmulqdq $0x10,%ymm28,%ymm5,%ymm25
+ vpclmulqdq $0x10,%ymm29,%ymm6,%ymm26
+
+ vaesenc %ymm19,%ymm0,%ymm0
+ vaesenc %ymm19,%ymm1,%ymm1
+ vaesenc %ymm19,%ymm2,%ymm2
+ vaesenc %ymm19,%ymm3,%ymm3
+
+ vpternlogd $0x96,%ymm26,%ymm25,%ymm24
+ vpclmulqdq $0x01,%ymm10,%ymm31,%ymm26
+ vpclmulqdq $0x10,%ymm30,%ymm7,%ymm25
+ vpxord %ymm25,%ymm24,%ymm24
+
+ vaesenc %ymm20,%ymm0,%ymm0
+ vaesenc %ymm20,%ymm1,%ymm1
+ vaesenc %ymm20,%ymm2,%ymm2
+ vaesenc %ymm20,%ymm3,%ymm3
+
+ vpshufd $0x4e,%ymm10,%ymm10
+ vpclmulqdq $0x11,%ymm27,%ymm4,%ymm4
+ vpclmulqdq $0x11,%ymm28,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm29,%ymm6,%ymm6
+
+ vaesenc %ymm21,%ymm0,%ymm0
+ vaesenc %ymm21,%ymm1,%ymm1
+ vaesenc %ymm21,%ymm2,%ymm2
+ vaesenc %ymm21,%ymm3,%ymm3
+
+ vpternlogd $0x96,%ymm26,%ymm10,%ymm24
+ vpclmulqdq $0x11,%ymm30,%ymm7,%ymm7
+ vpternlogd $0x96,%ymm6,%ymm5,%ymm4
+ vpclmulqdq $0x01,%ymm24,%ymm31,%ymm25
+
+ vaesenc %ymm22,%ymm0,%ymm0
+ vaesenc %ymm22,%ymm1,%ymm1
+ vaesenc %ymm22,%ymm2,%ymm2
+ vaesenc %ymm22,%ymm3,%ymm3
+
+ vpxord %ymm7,%ymm4,%ymm10
+ vpshufd $0x4e,%ymm24,%ymm24
+ vpternlogd $0x96,%ymm25,%ymm24,%ymm10
+
+ vaesenc %ymm23,%ymm0,%ymm0
+ vaesenc %ymm23,%ymm1,%ymm1
+ vaesenc %ymm23,%ymm2,%ymm2
+ vaesenc %ymm23,%ymm3,%ymm3
+
+ vextracti32x4 $1,%ymm10,%xmm4
+ vpxord %xmm4,%xmm10,%xmm10
+
+
+
+
+ vpxord 0(%rdi),%ymm14,%ymm4
+ vpxord 32(%rdi),%ymm14,%ymm5
+ vpxord 64(%rdi),%ymm14,%ymm6
+ vpxord 96(%rdi),%ymm14,%ymm7
+
+
+
+ vaesenclast %ymm4,%ymm0,%ymm4
+ vaesenclast %ymm5,%ymm1,%ymm5
+ vaesenclast %ymm6,%ymm2,%ymm6
+ vaesenclast %ymm7,%ymm3,%ymm7
+
+
+ vmovdqu8 %ymm4,0(%rsi)
+ vmovdqu8 %ymm5,32(%rsi)
+ vmovdqu8 %ymm6,64(%rsi)
+ vmovdqu8 %ymm7,96(%rsi)
+
+ subq $-128,%rdi
+ subq $-128,%rsi
+ addq $-128,%rdx
+ cmpq $128-1,%rdx
+ ja .Lcrypt_loop_4x__func2
+.Lcrypt_loop_4x_done__func2:
+
+ testq %rdx,%rdx
+ jz .Ldone__func2
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ movq %rdx,%rax
+ negq %rax
+ andq $-16,%rax
+ leaq 256(%r9,%rax,1),%r8
+ vpxor %xmm4,%xmm4,%xmm4
+ vpxor %xmm5,%xmm5,%xmm5
+ vpxor %xmm6,%xmm6,%xmm6
+
+ cmpq $32,%rdx
+ jb .Lpartial_vec__func2
+
+.Lcrypt_loop_1x__func2:
+
+
+
+ vpshufb %ymm8,%ymm12,%ymm0
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpxord %ymm13,%ymm0,%ymm0
+ leaq 16(%rcx),%rax
+.Lvaesenc_loop_tail_full_vec__func2:
+ vbroadcasti32x4 (%rax),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne .Lvaesenc_loop_tail_full_vec__func2
+ vaesenclast %ymm14,%ymm0,%ymm0
+
+
+ vmovdqu8 (%rdi),%ymm1
+ vpxord %ymm1,%ymm0,%ymm0
+ vmovdqu8 %ymm0,(%rsi)
+
+
+ vmovdqu8 (%r8),%ymm30
+ vpshufb %ymm8,%ymm1,%ymm0
+ vpxord %ymm10,%ymm0,%ymm0
+ vpclmulqdq $0x00,%ymm30,%ymm0,%ymm7
+ vpclmulqdq $0x01,%ymm30,%ymm0,%ymm1
+ vpclmulqdq $0x10,%ymm30,%ymm0,%ymm2
+ vpclmulqdq $0x11,%ymm30,%ymm0,%ymm3
+ vpxord %ymm7,%ymm4,%ymm4
+ vpternlogd $0x96,%ymm2,%ymm1,%ymm5
+ vpxord %ymm3,%ymm6,%ymm6
+
+ vpxor %xmm10,%xmm10,%xmm10
+
+ addq $32,%r8
+ addq $32,%rdi
+ addq $32,%rsi
+ subq $32,%rdx
+ cmpq $32,%rdx
+ jae .Lcrypt_loop_1x__func2
+
+ testq %rdx,%rdx
+ jz .Lreduce__func2
+
+.Lpartial_vec__func2:
+
+
+
+
+ movq $-1,%rax
+ bzhiq %rdx,%rax,%rax
+ kmovd %eax,%k1
+ addq $15,%rdx
+ andq $-16,%rdx
+ movq $-1,%rax
+ bzhiq %rdx,%rax,%rax
+ kmovd %eax,%k2
+
+
+
+ vpshufb %ymm8,%ymm12,%ymm0
+ vpxord %ymm13,%ymm0,%ymm0
+ leaq 16(%rcx),%rax
+.Lvaesenc_loop_tail_partialvec__func2:
+ vbroadcasti32x4 (%rax),%ymm9
+ vaesenc %ymm9,%ymm0,%ymm0
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne .Lvaesenc_loop_tail_partialvec__func2
+ vaesenclast %ymm14,%ymm0,%ymm0
+
+
+ vmovdqu8 (%rdi),%ymm1{%k1}{z}
+ vpxord %ymm1,%ymm0,%ymm0
+ vmovdqu8 %ymm0,(%rsi){%k1}
+
+
+
+
+
+
+
+
+
+
+
+
+
+ vmovdqu8 (%r8),%ymm30{%k2}{z}
+
+ vpshufb %ymm8,%ymm1,%ymm0
+ vpxord %ymm10,%ymm0,%ymm0
+ vpclmulqdq $0x00,%ymm30,%ymm0,%ymm7
+ vpclmulqdq $0x01,%ymm30,%ymm0,%ymm1
+ vpclmulqdq $0x10,%ymm30,%ymm0,%ymm2
+ vpclmulqdq $0x11,%ymm30,%ymm0,%ymm3
+ vpxord %ymm7,%ymm4,%ymm4
+ vpternlogd $0x96,%ymm2,%ymm1,%ymm5
+ vpxord %ymm3,%ymm6,%ymm6
+
+
+.Lreduce__func2:
+
+ vpclmulqdq $0x01,%ymm4,%ymm31,%ymm0
+ vpshufd $0x4e,%ymm4,%ymm4
+ vpternlogd $0x96,%ymm0,%ymm4,%ymm5
+ vpclmulqdq $0x01,%ymm5,%ymm31,%ymm0
+ vpshufd $0x4e,%ymm5,%ymm5
+ vpternlogd $0x96,%ymm0,%ymm5,%ymm6
+
+ vextracti32x4 $1,%ymm6,%xmm0
+ vpxord %xmm0,%xmm6,%xmm10
+
+
+.Ldone__func2:
+
+ vpshufb %xmm8,%xmm10,%xmm10
+ vmovdqu %xmm10,(%r12)
+
+ vzeroupper
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ ret
+
+.cfi_endproc
+.size aes_gcm_dec_update_vaes_avx10_256, . - aes_gcm_dec_update_vaes_avx10_256
+.globl gcm_ghash_vpclmulqdq_avx10_512
+.hidden gcm_ghash_vpclmulqdq_avx10_512
+.type gcm_ghash_vpclmulqdq_avx10_512,@function
+.align 32
+gcm_ghash_vpclmulqdq_avx10_512:
+.cfi_startproc
+
+_CET_ENDBR
+
+
+
+
+
+
+ vmovdqu .Lbswap_mask(%rip),%xmm4
+ vmovdqu .Lgfpoly(%rip),%xmm10
+
+
+ vmovdqu (%rdi),%xmm5
+ vpshufb %xmm4,%xmm5,%xmm5
+
+
+ cmpq $64,%rcx
+ jb .Laad_blockbyblock__func2
+
+
+
+ vshufi64x2 $0,%zmm4,%zmm4,%zmm4
+ vshufi64x2 $0,%zmm10,%zmm10,%zmm10
+
+
+ vmovdqu8 256-64(%rsi),%zmm9
+
+ cmpq $256-1,%rcx
+ jbe .Laad_loop_1x__func2
+
+
+ vmovdqu8 256-256(%rsi),%zmm6
+ vmovdqu8 256-192(%rsi),%zmm7
+ vmovdqu8 256-128(%rsi),%zmm8
+
+
+.Laad_loop_4x__func2:
+ vmovdqu8 0(%rdx),%zmm0
+ vmovdqu8 64(%rdx),%zmm1
+ vmovdqu8 128(%rdx),%zmm2
+ vmovdqu8 192(%rdx),%zmm3
+ vpshufb %zmm4,%zmm0,%zmm0
+ vpxord %zmm5,%zmm0,%zmm0
+ vpshufb %zmm4,%zmm1,%zmm1
+ vpshufb %zmm4,%zmm2,%zmm2
+ vpshufb %zmm4,%zmm3,%zmm3
+ vpclmulqdq $0x00,%zmm6,%zmm0,%zmm5
+ vpclmulqdq $0x00,%zmm7,%zmm1,%zmm11
+ vpclmulqdq $0x00,%zmm8,%zmm2,%zmm12
+ vpxord %zmm11,%zmm5,%zmm5
+ vpclmulqdq $0x00,%zmm9,%zmm3,%zmm13
+ vpternlogd $0x96,%zmm13,%zmm12,%zmm5
+ vpclmulqdq $0x01,%zmm6,%zmm0,%zmm11
+ vpclmulqdq $0x01,%zmm7,%zmm1,%zmm12
+ vpclmulqdq $0x01,%zmm8,%zmm2,%zmm13
+ vpternlogd $0x96,%zmm13,%zmm12,%zmm11
+ vpclmulqdq $0x01,%zmm9,%zmm3,%zmm12
+ vpclmulqdq $0x10,%zmm6,%zmm0,%zmm13
+ vpternlogd $0x96,%zmm13,%zmm12,%zmm11
+ vpclmulqdq $0x10,%zmm7,%zmm1,%zmm12
+ vpclmulqdq $0x10,%zmm8,%zmm2,%zmm13
+ vpternlogd $0x96,%zmm13,%zmm12,%zmm11
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm13
+ vpclmulqdq $0x10,%zmm9,%zmm3,%zmm12
+ vpxord %zmm12,%zmm11,%zmm11
+ vpshufd $0x4e,%zmm5,%zmm5
+ vpclmulqdq $0x11,%zmm6,%zmm0,%zmm0
+ vpclmulqdq $0x11,%zmm7,%zmm1,%zmm1
+ vpclmulqdq $0x11,%zmm8,%zmm2,%zmm2
+ vpternlogd $0x96,%zmm13,%zmm5,%zmm11
+ vpclmulqdq $0x11,%zmm9,%zmm3,%zmm3
+ vpternlogd $0x96,%zmm2,%zmm1,%zmm0
+ vpclmulqdq $0x01,%zmm11,%zmm10,%zmm12
+ vpxord %zmm3,%zmm0,%zmm5
+ vpshufd $0x4e,%zmm11,%zmm11
+ vpternlogd $0x96,%zmm12,%zmm11,%zmm5
+ vextracti32x4 $1,%zmm5,%xmm0
+ vextracti32x4 $2,%zmm5,%xmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vpxord %xmm0,%xmm5,%xmm5
+ vpternlogd $0x96,%xmm1,%xmm2,%xmm5
+
+ subq $-256,%rdx
+ addq $-256,%rcx
+ cmpq $256-1,%rcx
+ ja .Laad_loop_4x__func2
+
+
+ cmpq $64,%rcx
+ jb .Laad_large_done__func2
+.Laad_loop_1x__func2:
+ vmovdqu8 (%rdx),%zmm0
+ vpshufb %zmm4,%zmm0,%zmm0
+ vpxord %zmm0,%zmm5,%zmm5
+ vpclmulqdq $0x00,%zmm9,%zmm5,%zmm0
+ vpclmulqdq $0x01,%zmm9,%zmm5,%zmm1
+ vpclmulqdq $0x10,%zmm9,%zmm5,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm0,%zmm10,%zmm2
+ vpshufd $0x4e,%zmm0,%zmm0
+ vpternlogd $0x96,%zmm2,%zmm0,%zmm1
+ vpclmulqdq $0x11,%zmm9,%zmm5,%zmm5
+ vpclmulqdq $0x01,%zmm1,%zmm10,%zmm0
+ vpshufd $0x4e,%zmm1,%zmm1
+ vpternlogd $0x96,%zmm0,%zmm1,%zmm5
+
+ vextracti32x4 $1,%zmm5,%xmm0
+ vextracti32x4 $2,%zmm5,%xmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vpxord %xmm0,%xmm5,%xmm5
+ vpternlogd $0x96,%xmm1,%xmm2,%xmm5
+
+ addq $64,%rdx
+ subq $64,%rcx
+ cmpq $64,%rcx
+ jae .Laad_loop_1x__func2
+
+.Laad_large_done__func2:
+
+
+ vzeroupper
+
+
+.Laad_blockbyblock__func2:
+ testq %rcx,%rcx
+ jz .Laad_done__func2
+ vmovdqu 256-16(%rsi),%xmm9
+.Laad_loop_blockbyblock__func2:
+ vmovdqu (%rdx),%xmm0
+ vpshufb %xmm4,%xmm0,%xmm0
+ vpxor %xmm0,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm9,%xmm5,%xmm0
+ vpclmulqdq $0x01,%xmm9,%xmm5,%xmm1
+ vpclmulqdq $0x10,%xmm9,%xmm5,%xmm2
+ vpxord %xmm2,%xmm1,%xmm1
+ vpclmulqdq $0x01,%xmm0,%xmm10,%xmm2
+ vpshufd $0x4e,%xmm0,%xmm0
+ vpternlogd $0x96,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x11,%xmm9,%xmm5,%xmm5
+ vpclmulqdq $0x01,%xmm1,%xmm10,%xmm0
+ vpshufd $0x4e,%xmm1,%xmm1
+ vpternlogd $0x96,%xmm0,%xmm1,%xmm5
+
+ addq $16,%rdx
+ subq $16,%rcx
+ jnz .Laad_loop_blockbyblock__func2
+
+.Laad_done__func2:
+
+ vpshufb %xmm4,%xmm5,%xmm5
+ vmovdqu %xmm5,(%rdi)
+ ret
+
+.cfi_endproc
+.size gcm_ghash_vpclmulqdq_avx10_512, . - gcm_ghash_vpclmulqdq_avx10_512
+.globl aes_gcm_enc_update_vaes_avx10_512
+.hidden aes_gcm_enc_update_vaes_avx10_512
+.type aes_gcm_enc_update_vaes_avx10_512,@function
+.align 32
+aes_gcm_enc_update_vaes_avx10_512:
+.cfi_startproc
+
+_CET_ENDBR
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-16
+
+ movq 16(%rsp),%r12
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+ movb $1,BORINGSSL_function_hit+7(%rip)
+#endif
+
+ vbroadcasti32x4 .Lbswap_mask(%rip),%zmm8
+ vbroadcasti32x4 .Lgfpoly(%rip),%zmm31
+
+
+
+ vmovdqu (%r12),%xmm10
+ vpshufb %xmm8,%xmm10,%xmm10
+ vbroadcasti32x4 (%r8),%zmm12
+ vpshufb %zmm8,%zmm12,%zmm12
+
+
+
+ movl 240(%rcx),%r10d
+ leal -20(,%r10,4),%r10d
+
+
+
+
+ leaq 96(%rcx,%r10,4),%r11
+ vbroadcasti32x4 (%rcx),%zmm13
+ vbroadcasti32x4 (%r11),%zmm14
+
+
+ vpaddd .Lctr_pattern(%rip),%zmm12,%zmm12
+
+
+ vbroadcasti32x4 .Linc_4blocks(%rip),%zmm11
+
+
+
+ cmpq $256-1,%rdx
+ jbe .Lcrypt_loop_4x_done__func3
+
+
+ vmovdqu8 256-256(%r9),%zmm27
+ vmovdqu8 256-192(%r9),%zmm28
+ vmovdqu8 256-128(%r9),%zmm29
+ vmovdqu8 256-64(%r9),%zmm30
+
+
+
+
+ vpshufb %zmm8,%zmm12,%zmm0
+ vpaddd %zmm11,%zmm12,%zmm12
+ vpshufb %zmm8,%zmm12,%zmm1
+ vpaddd %zmm11,%zmm12,%zmm12
+ vpshufb %zmm8,%zmm12,%zmm2
+ vpaddd %zmm11,%zmm12,%zmm12
+ vpshufb %zmm8,%zmm12,%zmm3
+ vpaddd %zmm11,%zmm12,%zmm12
+
+
+ vpxord %zmm13,%zmm0,%zmm0
+ vpxord %zmm13,%zmm1,%zmm1
+ vpxord %zmm13,%zmm2,%zmm2
+ vpxord %zmm13,%zmm3,%zmm3
+
+ leaq 16(%rcx),%rax
+.Lvaesenc_loop_first_4_vecs__func3:
+ vbroadcasti32x4 (%rax),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ vaesenc %zmm9,%zmm1,%zmm1
+ vaesenc %zmm9,%zmm2,%zmm2
+ vaesenc %zmm9,%zmm3,%zmm3
+
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne .Lvaesenc_loop_first_4_vecs__func3
+
+
+
+ vpxord 0(%rdi),%zmm14,%zmm4
+ vpxord 64(%rdi),%zmm14,%zmm5
+ vpxord 128(%rdi),%zmm14,%zmm6
+ vpxord 192(%rdi),%zmm14,%zmm7
+
+
+
+ vaesenclast %zmm4,%zmm0,%zmm4
+ vaesenclast %zmm5,%zmm1,%zmm5
+ vaesenclast %zmm6,%zmm2,%zmm6
+ vaesenclast %zmm7,%zmm3,%zmm7
+
+
+ vmovdqu8 %zmm4,0(%rsi)
+ vmovdqu8 %zmm5,64(%rsi)
+ vmovdqu8 %zmm6,128(%rsi)
+ vmovdqu8 %zmm7,192(%rsi)
+
+ subq $-256,%rdi
+ subq $-256,%rsi
+ addq $-256,%rdx
+ cmpq $256-1,%rdx
+ jbe .Lghash_last_ciphertext_4x__func3
+ vbroadcasti32x4 -144(%r11),%zmm15
+ vbroadcasti32x4 -128(%r11),%zmm16
+ vbroadcasti32x4 -112(%r11),%zmm17
+ vbroadcasti32x4 -96(%r11),%zmm18
+ vbroadcasti32x4 -80(%r11),%zmm19
+ vbroadcasti32x4 -64(%r11),%zmm20
+ vbroadcasti32x4 -48(%r11),%zmm21
+ vbroadcasti32x4 -32(%r11),%zmm22
+ vbroadcasti32x4 -16(%r11),%zmm23
+.Lcrypt_loop_4x__func3:
+
+
+
+ vpshufb %zmm8,%zmm12,%zmm0
+ vpaddd %zmm11,%zmm12,%zmm12
+ vpshufb %zmm8,%zmm12,%zmm1
+ vpaddd %zmm11,%zmm12,%zmm12
+ vpshufb %zmm8,%zmm12,%zmm2
+ vpaddd %zmm11,%zmm12,%zmm12
+ vpshufb %zmm8,%zmm12,%zmm3
+ vpaddd %zmm11,%zmm12,%zmm12
+
+
+ vpxord %zmm13,%zmm0,%zmm0
+ vpxord %zmm13,%zmm1,%zmm1
+ vpxord %zmm13,%zmm2,%zmm2
+ vpxord %zmm13,%zmm3,%zmm3
+
+ cmpl $24,%r10d
+ jl .Laes128__func3
+ je .Laes192__func3
+
+ vbroadcasti32x4 -208(%r11),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ vaesenc %zmm9,%zmm1,%zmm1
+ vaesenc %zmm9,%zmm2,%zmm2
+ vaesenc %zmm9,%zmm3,%zmm3
+
+ vbroadcasti32x4 -192(%r11),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ vaesenc %zmm9,%zmm1,%zmm1
+ vaesenc %zmm9,%zmm2,%zmm2
+ vaesenc %zmm9,%zmm3,%zmm3
+
+.Laes192__func3:
+ vbroadcasti32x4 -176(%r11),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ vaesenc %zmm9,%zmm1,%zmm1
+ vaesenc %zmm9,%zmm2,%zmm2
+ vaesenc %zmm9,%zmm3,%zmm3
+
+ vbroadcasti32x4 -160(%r11),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ vaesenc %zmm9,%zmm1,%zmm1
+ vaesenc %zmm9,%zmm2,%zmm2
+ vaesenc %zmm9,%zmm3,%zmm3
+
+.Laes128__func3:
+ vpshufb %zmm8,%zmm4,%zmm4
+ vpxord %zmm10,%zmm4,%zmm4
+ vpshufb %zmm8,%zmm5,%zmm5
+ vpshufb %zmm8,%zmm6,%zmm6
+
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm1,%zmm1
+ vaesenc %zmm15,%zmm2,%zmm2
+ vaesenc %zmm15,%zmm3,%zmm3
+
+ vpshufb %zmm8,%zmm7,%zmm7
+ vpclmulqdq $0x00,%zmm27,%zmm4,%zmm10
+ vpclmulqdq $0x00,%zmm28,%zmm5,%zmm24
+ vpclmulqdq $0x00,%zmm29,%zmm6,%zmm25
+
+ vaesenc %zmm16,%zmm0,%zmm0
+ vaesenc %zmm16,%zmm1,%zmm1
+ vaesenc %zmm16,%zmm2,%zmm2
+ vaesenc %zmm16,%zmm3,%zmm3
+
+ vpxord %zmm24,%zmm10,%zmm10
+ vpclmulqdq $0x00,%zmm30,%zmm7,%zmm26
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm10
+ vpclmulqdq $0x01,%zmm27,%zmm4,%zmm24
+
+ vaesenc %zmm17,%zmm0,%zmm0
+ vaesenc %zmm17,%zmm1,%zmm1
+ vaesenc %zmm17,%zmm2,%zmm2
+ vaesenc %zmm17,%zmm3,%zmm3
+
+ vpclmulqdq $0x01,%zmm28,%zmm5,%zmm25
+ vpclmulqdq $0x01,%zmm29,%zmm6,%zmm26
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm24
+ vpclmulqdq $0x01,%zmm30,%zmm7,%zmm25
+
+ vaesenc %zmm18,%zmm0,%zmm0
+ vaesenc %zmm18,%zmm1,%zmm1
+ vaesenc %zmm18,%zmm2,%zmm2
+ vaesenc %zmm18,%zmm3,%zmm3
+
+ vpclmulqdq $0x10,%zmm27,%zmm4,%zmm26
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm24
+ vpclmulqdq $0x10,%zmm28,%zmm5,%zmm25
+ vpclmulqdq $0x10,%zmm29,%zmm6,%zmm26
+
+ vaesenc %zmm19,%zmm0,%zmm0
+ vaesenc %zmm19,%zmm1,%zmm1
+ vaesenc %zmm19,%zmm2,%zmm2
+ vaesenc %zmm19,%zmm3,%zmm3
+
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm24
+ vpclmulqdq $0x01,%zmm10,%zmm31,%zmm26
+ vpclmulqdq $0x10,%zmm30,%zmm7,%zmm25
+ vpxord %zmm25,%zmm24,%zmm24
+
+ vaesenc %zmm20,%zmm0,%zmm0
+ vaesenc %zmm20,%zmm1,%zmm1
+ vaesenc %zmm20,%zmm2,%zmm2
+ vaesenc %zmm20,%zmm3,%zmm3
+
+ vpshufd $0x4e,%zmm10,%zmm10
+ vpclmulqdq $0x11,%zmm27,%zmm4,%zmm4
+ vpclmulqdq $0x11,%zmm28,%zmm5,%zmm5
+ vpclmulqdq $0x11,%zmm29,%zmm6,%zmm6
+
+ vaesenc %zmm21,%zmm0,%zmm0
+ vaesenc %zmm21,%zmm1,%zmm1
+ vaesenc %zmm21,%zmm2,%zmm2
+ vaesenc %zmm21,%zmm3,%zmm3
+
+ vpternlogd $0x96,%zmm26,%zmm10,%zmm24
+ vpclmulqdq $0x11,%zmm30,%zmm7,%zmm7
+ vpternlogd $0x96,%zmm6,%zmm5,%zmm4
+ vpclmulqdq $0x01,%zmm24,%zmm31,%zmm25
+
+ vaesenc %zmm22,%zmm0,%zmm0
+ vaesenc %zmm22,%zmm1,%zmm1
+ vaesenc %zmm22,%zmm2,%zmm2
+ vaesenc %zmm22,%zmm3,%zmm3
+
+ vpxord %zmm7,%zmm4,%zmm10
+ vpshufd $0x4e,%zmm24,%zmm24
+ vpternlogd $0x96,%zmm25,%zmm24,%zmm10
+
+ vaesenc %zmm23,%zmm0,%zmm0
+ vaesenc %zmm23,%zmm1,%zmm1
+ vaesenc %zmm23,%zmm2,%zmm2
+ vaesenc %zmm23,%zmm3,%zmm3
+
+ vextracti32x4 $1,%zmm10,%xmm4
+ vextracti32x4 $2,%zmm10,%xmm5
+ vextracti32x4 $3,%zmm10,%xmm6
+ vpxord %xmm4,%xmm10,%xmm10
+ vpternlogd $0x96,%xmm5,%xmm6,%xmm10
+
+
+
+
+ vpxord 0(%rdi),%zmm14,%zmm4
+ vpxord 64(%rdi),%zmm14,%zmm5
+ vpxord 128(%rdi),%zmm14,%zmm6
+ vpxord 192(%rdi),%zmm14,%zmm7
+
+
+
+ vaesenclast %zmm4,%zmm0,%zmm4
+ vaesenclast %zmm5,%zmm1,%zmm5
+ vaesenclast %zmm6,%zmm2,%zmm6
+ vaesenclast %zmm7,%zmm3,%zmm7
+
+
+ vmovdqu8 %zmm4,0(%rsi)
+ vmovdqu8 %zmm5,64(%rsi)
+ vmovdqu8 %zmm6,128(%rsi)
+ vmovdqu8 %zmm7,192(%rsi)
+
+ subq $-256,%rdi
+ subq $-256,%rsi
+ addq $-256,%rdx
+ cmpq $256-1,%rdx
+ ja .Lcrypt_loop_4x__func3
+.Lghash_last_ciphertext_4x__func3:
+ vpshufb %zmm8,%zmm4,%zmm4
+ vpxord %zmm10,%zmm4,%zmm4
+ vpshufb %zmm8,%zmm5,%zmm5
+ vpshufb %zmm8,%zmm6,%zmm6
+ vpshufb %zmm8,%zmm7,%zmm7
+ vpclmulqdq $0x00,%zmm27,%zmm4,%zmm10
+ vpclmulqdq $0x00,%zmm28,%zmm5,%zmm24
+ vpclmulqdq $0x00,%zmm29,%zmm6,%zmm25
+ vpxord %zmm24,%zmm10,%zmm10
+ vpclmulqdq $0x00,%zmm30,%zmm7,%zmm26
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm10
+ vpclmulqdq $0x01,%zmm27,%zmm4,%zmm24
+ vpclmulqdq $0x01,%zmm28,%zmm5,%zmm25
+ vpclmulqdq $0x01,%zmm29,%zmm6,%zmm26
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm24
+ vpclmulqdq $0x01,%zmm30,%zmm7,%zmm25
+ vpclmulqdq $0x10,%zmm27,%zmm4,%zmm26
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm24
+ vpclmulqdq $0x10,%zmm28,%zmm5,%zmm25
+ vpclmulqdq $0x10,%zmm29,%zmm6,%zmm26
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm24
+ vpclmulqdq $0x01,%zmm10,%zmm31,%zmm26
+ vpclmulqdq $0x10,%zmm30,%zmm7,%zmm25
+ vpxord %zmm25,%zmm24,%zmm24
+ vpshufd $0x4e,%zmm10,%zmm10
+ vpclmulqdq $0x11,%zmm27,%zmm4,%zmm4
+ vpclmulqdq $0x11,%zmm28,%zmm5,%zmm5
+ vpclmulqdq $0x11,%zmm29,%zmm6,%zmm6
+ vpternlogd $0x96,%zmm26,%zmm10,%zmm24
+ vpclmulqdq $0x11,%zmm30,%zmm7,%zmm7
+ vpternlogd $0x96,%zmm6,%zmm5,%zmm4
+ vpclmulqdq $0x01,%zmm24,%zmm31,%zmm25
+ vpxord %zmm7,%zmm4,%zmm10
+ vpshufd $0x4e,%zmm24,%zmm24
+ vpternlogd $0x96,%zmm25,%zmm24,%zmm10
+ vextracti32x4 $1,%zmm10,%xmm4
+ vextracti32x4 $2,%zmm10,%xmm5
+ vextracti32x4 $3,%zmm10,%xmm6
+ vpxord %xmm4,%xmm10,%xmm10
+ vpternlogd $0x96,%xmm5,%xmm6,%xmm10
+
+.Lcrypt_loop_4x_done__func3:
+
+ testq %rdx,%rdx
+ jz .Ldone__func3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ movq %rdx,%rax
+ negq %rax
+ andq $-16,%rax
+ leaq 256(%r9,%rax,1),%r8
+ vpxor %xmm4,%xmm4,%xmm4
+ vpxor %xmm5,%xmm5,%xmm5
+ vpxor %xmm6,%xmm6,%xmm6
+
+ cmpq $64,%rdx
+ jb .Lpartial_vec__func3
+
+.Lcrypt_loop_1x__func3:
+
+
+
+ vpshufb %zmm8,%zmm12,%zmm0
+ vpaddd %zmm11,%zmm12,%zmm12
+ vpxord %zmm13,%zmm0,%zmm0
+ leaq 16(%rcx),%rax
+.Lvaesenc_loop_tail_full_vec__func3:
+ vbroadcasti32x4 (%rax),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne .Lvaesenc_loop_tail_full_vec__func3
+ vaesenclast %zmm14,%zmm0,%zmm0
+
+
+ vmovdqu8 (%rdi),%zmm1
+ vpxord %zmm1,%zmm0,%zmm0
+ vmovdqu8 %zmm0,(%rsi)
+
+
+ vmovdqu8 (%r8),%zmm30
+ vpshufb %zmm8,%zmm0,%zmm0
+ vpxord %zmm10,%zmm0,%zmm0
+ vpclmulqdq $0x00,%zmm30,%zmm0,%zmm7
+ vpclmulqdq $0x01,%zmm30,%zmm0,%zmm1
+ vpclmulqdq $0x10,%zmm30,%zmm0,%zmm2
+ vpclmulqdq $0x11,%zmm30,%zmm0,%zmm3
+ vpxord %zmm7,%zmm4,%zmm4
+ vpternlogd $0x96,%zmm2,%zmm1,%zmm5
+ vpxord %zmm3,%zmm6,%zmm6
+
+ vpxor %xmm10,%xmm10,%xmm10
+
+ addq $64,%r8
+ addq $64,%rdi
+ addq $64,%rsi
+ subq $64,%rdx
+ cmpq $64,%rdx
+ jae .Lcrypt_loop_1x__func3
+
+ testq %rdx,%rdx
+ jz .Lreduce__func3
+
+.Lpartial_vec__func3:
+
+
+
+
+ movq $-1,%rax
+ bzhiq %rdx,%rax,%rax
+ kmovq %rax,%k1
+ addq $15,%rdx
+ andq $-16,%rdx
+ movq $-1,%rax
+ bzhiq %rdx,%rax,%rax
+ kmovq %rax,%k2
+
+
+
+ vpshufb %zmm8,%zmm12,%zmm0
+ vpxord %zmm13,%zmm0,%zmm0
+ leaq 16(%rcx),%rax
+.Lvaesenc_loop_tail_partialvec__func3:
+ vbroadcasti32x4 (%rax),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne .Lvaesenc_loop_tail_partialvec__func3
+ vaesenclast %zmm14,%zmm0,%zmm0
+
+
+ vmovdqu8 (%rdi),%zmm1{%k1}{z}
+ vpxord %zmm1,%zmm0,%zmm0
+ vmovdqu8 %zmm0,(%rsi){%k1}
+
+
+
+
+
+
+
+
+
+
+
+
+
+ vmovdqu8 (%r8),%zmm30{%k2}{z}
+ vmovdqu8 %zmm0,%zmm1{%k1}{z}
+ vpshufb %zmm8,%zmm1,%zmm0
+ vpxord %zmm10,%zmm0,%zmm0
+ vpclmulqdq $0x00,%zmm30,%zmm0,%zmm7
+ vpclmulqdq $0x01,%zmm30,%zmm0,%zmm1
+ vpclmulqdq $0x10,%zmm30,%zmm0,%zmm2
+ vpclmulqdq $0x11,%zmm30,%zmm0,%zmm3
+ vpxord %zmm7,%zmm4,%zmm4
+ vpternlogd $0x96,%zmm2,%zmm1,%zmm5
+ vpxord %zmm3,%zmm6,%zmm6
+
+
+.Lreduce__func3:
+
+ vpclmulqdq $0x01,%zmm4,%zmm31,%zmm0
+ vpshufd $0x4e,%zmm4,%zmm4
+ vpternlogd $0x96,%zmm0,%zmm4,%zmm5
+ vpclmulqdq $0x01,%zmm5,%zmm31,%zmm0
+ vpshufd $0x4e,%zmm5,%zmm5
+ vpternlogd $0x96,%zmm0,%zmm5,%zmm6
+
+ vextracti32x4 $1,%zmm6,%xmm0
+ vextracti32x4 $2,%zmm6,%xmm1
+ vextracti32x4 $3,%zmm6,%xmm2
+ vpxord %xmm0,%xmm6,%xmm10
+ vpternlogd $0x96,%xmm1,%xmm2,%xmm10
+
+
+.Ldone__func3:
+
+ vpshufb %xmm8,%xmm10,%xmm10
+ vmovdqu %xmm10,(%r12)
+
+ vzeroupper
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ ret
+
+.cfi_endproc
+.size aes_gcm_enc_update_vaes_avx10_512, . - aes_gcm_enc_update_vaes_avx10_512
+.globl aes_gcm_dec_update_vaes_avx10_512
+.hidden aes_gcm_dec_update_vaes_avx10_512
+.type aes_gcm_dec_update_vaes_avx10_512,@function
+.align 32
+aes_gcm_dec_update_vaes_avx10_512:
+.cfi_startproc
+
+_CET_ENDBR
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-16
+
+ movq 16(%rsp),%r12
+
+ vbroadcasti32x4 .Lbswap_mask(%rip),%zmm8
+ vbroadcasti32x4 .Lgfpoly(%rip),%zmm31
+
+
+
+ vmovdqu (%r12),%xmm10
+ vpshufb %xmm8,%xmm10,%xmm10
+ vbroadcasti32x4 (%r8),%zmm12
+ vpshufb %zmm8,%zmm12,%zmm12
+
+
+
+ movl 240(%rcx),%r10d
+ leal -20(,%r10,4),%r10d
+
+
+
+
+ leaq 96(%rcx,%r10,4),%r11
+ vbroadcasti32x4 (%rcx),%zmm13
+ vbroadcasti32x4 (%r11),%zmm14
+
+
+ vpaddd .Lctr_pattern(%rip),%zmm12,%zmm12
+
+
+ vbroadcasti32x4 .Linc_4blocks(%rip),%zmm11
+
+
+
+ cmpq $256-1,%rdx
+ jbe .Lcrypt_loop_4x_done__func4
+
+
+ vmovdqu8 256-256(%r9),%zmm27
+ vmovdqu8 256-192(%r9),%zmm28
+ vmovdqu8 256-128(%r9),%zmm29
+ vmovdqu8 256-64(%r9),%zmm30
+ vbroadcasti32x4 -144(%r11),%zmm15
+ vbroadcasti32x4 -128(%r11),%zmm16
+ vbroadcasti32x4 -112(%r11),%zmm17
+ vbroadcasti32x4 -96(%r11),%zmm18
+ vbroadcasti32x4 -80(%r11),%zmm19
+ vbroadcasti32x4 -64(%r11),%zmm20
+ vbroadcasti32x4 -48(%r11),%zmm21
+ vbroadcasti32x4 -32(%r11),%zmm22
+ vbroadcasti32x4 -16(%r11),%zmm23
+.Lcrypt_loop_4x__func4:
+ vmovdqu8 0(%rdi),%zmm4
+ vmovdqu8 64(%rdi),%zmm5
+ vmovdqu8 128(%rdi),%zmm6
+ vmovdqu8 192(%rdi),%zmm7
+
+
+
+ vpshufb %zmm8,%zmm12,%zmm0
+ vpaddd %zmm11,%zmm12,%zmm12
+ vpshufb %zmm8,%zmm12,%zmm1
+ vpaddd %zmm11,%zmm12,%zmm12
+ vpshufb %zmm8,%zmm12,%zmm2
+ vpaddd %zmm11,%zmm12,%zmm12
+ vpshufb %zmm8,%zmm12,%zmm3
+ vpaddd %zmm11,%zmm12,%zmm12
+
+
+ vpxord %zmm13,%zmm0,%zmm0
+ vpxord %zmm13,%zmm1,%zmm1
+ vpxord %zmm13,%zmm2,%zmm2
+ vpxord %zmm13,%zmm3,%zmm3
+
+ cmpl $24,%r10d
+ jl .Laes128__func4
+ je .Laes192__func4
+
+ vbroadcasti32x4 -208(%r11),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ vaesenc %zmm9,%zmm1,%zmm1
+ vaesenc %zmm9,%zmm2,%zmm2
+ vaesenc %zmm9,%zmm3,%zmm3
+
+ vbroadcasti32x4 -192(%r11),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ vaesenc %zmm9,%zmm1,%zmm1
+ vaesenc %zmm9,%zmm2,%zmm2
+ vaesenc %zmm9,%zmm3,%zmm3
+
+.Laes192__func4:
+ vbroadcasti32x4 -176(%r11),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ vaesenc %zmm9,%zmm1,%zmm1
+ vaesenc %zmm9,%zmm2,%zmm2
+ vaesenc %zmm9,%zmm3,%zmm3
+
+ vbroadcasti32x4 -160(%r11),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ vaesenc %zmm9,%zmm1,%zmm1
+ vaesenc %zmm9,%zmm2,%zmm2
+ vaesenc %zmm9,%zmm3,%zmm3
+
+.Laes128__func4:
+ vpshufb %zmm8,%zmm4,%zmm4
+ vpxord %zmm10,%zmm4,%zmm4
+ vpshufb %zmm8,%zmm5,%zmm5
+ vpshufb %zmm8,%zmm6,%zmm6
+
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm1,%zmm1
+ vaesenc %zmm15,%zmm2,%zmm2
+ vaesenc %zmm15,%zmm3,%zmm3
+
+ vpshufb %zmm8,%zmm7,%zmm7
+ vpclmulqdq $0x00,%zmm27,%zmm4,%zmm10
+ vpclmulqdq $0x00,%zmm28,%zmm5,%zmm24
+ vpclmulqdq $0x00,%zmm29,%zmm6,%zmm25
+
+ vaesenc %zmm16,%zmm0,%zmm0
+ vaesenc %zmm16,%zmm1,%zmm1
+ vaesenc %zmm16,%zmm2,%zmm2
+ vaesenc %zmm16,%zmm3,%zmm3
+
+ vpxord %zmm24,%zmm10,%zmm10
+ vpclmulqdq $0x00,%zmm30,%zmm7,%zmm26
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm10
+ vpclmulqdq $0x01,%zmm27,%zmm4,%zmm24
+
+ vaesenc %zmm17,%zmm0,%zmm0
+ vaesenc %zmm17,%zmm1,%zmm1
+ vaesenc %zmm17,%zmm2,%zmm2
+ vaesenc %zmm17,%zmm3,%zmm3
+
+ vpclmulqdq $0x01,%zmm28,%zmm5,%zmm25
+ vpclmulqdq $0x01,%zmm29,%zmm6,%zmm26
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm24
+ vpclmulqdq $0x01,%zmm30,%zmm7,%zmm25
+
+ vaesenc %zmm18,%zmm0,%zmm0
+ vaesenc %zmm18,%zmm1,%zmm1
+ vaesenc %zmm18,%zmm2,%zmm2
+ vaesenc %zmm18,%zmm3,%zmm3
+
+ vpclmulqdq $0x10,%zmm27,%zmm4,%zmm26
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm24
+ vpclmulqdq $0x10,%zmm28,%zmm5,%zmm25
+ vpclmulqdq $0x10,%zmm29,%zmm6,%zmm26
+
+ vaesenc %zmm19,%zmm0,%zmm0
+ vaesenc %zmm19,%zmm1,%zmm1
+ vaesenc %zmm19,%zmm2,%zmm2
+ vaesenc %zmm19,%zmm3,%zmm3
+
+ vpternlogd $0x96,%zmm26,%zmm25,%zmm24
+ vpclmulqdq $0x01,%zmm10,%zmm31,%zmm26
+ vpclmulqdq $0x10,%zmm30,%zmm7,%zmm25
+ vpxord %zmm25,%zmm24,%zmm24
+
+ vaesenc %zmm20,%zmm0,%zmm0
+ vaesenc %zmm20,%zmm1,%zmm1
+ vaesenc %zmm20,%zmm2,%zmm2
+ vaesenc %zmm20,%zmm3,%zmm3
+
+ vpshufd $0x4e,%zmm10,%zmm10
+ vpclmulqdq $0x11,%zmm27,%zmm4,%zmm4
+ vpclmulqdq $0x11,%zmm28,%zmm5,%zmm5
+ vpclmulqdq $0x11,%zmm29,%zmm6,%zmm6
+
+ vaesenc %zmm21,%zmm0,%zmm0
+ vaesenc %zmm21,%zmm1,%zmm1
+ vaesenc %zmm21,%zmm2,%zmm2
+ vaesenc %zmm21,%zmm3,%zmm3
+
+ vpternlogd $0x96,%zmm26,%zmm10,%zmm24
+ vpclmulqdq $0x11,%zmm30,%zmm7,%zmm7
+ vpternlogd $0x96,%zmm6,%zmm5,%zmm4
+ vpclmulqdq $0x01,%zmm24,%zmm31,%zmm25
+
+ vaesenc %zmm22,%zmm0,%zmm0
+ vaesenc %zmm22,%zmm1,%zmm1
+ vaesenc %zmm22,%zmm2,%zmm2
+ vaesenc %zmm22,%zmm3,%zmm3
+
+ vpxord %zmm7,%zmm4,%zmm10
+ vpshufd $0x4e,%zmm24,%zmm24
+ vpternlogd $0x96,%zmm25,%zmm24,%zmm10
+
+ vaesenc %zmm23,%zmm0,%zmm0
+ vaesenc %zmm23,%zmm1,%zmm1
+ vaesenc %zmm23,%zmm2,%zmm2
+ vaesenc %zmm23,%zmm3,%zmm3
+
+ vextracti32x4 $1,%zmm10,%xmm4
+ vextracti32x4 $2,%zmm10,%xmm5
+ vextracti32x4 $3,%zmm10,%xmm6
+ vpxord %xmm4,%xmm10,%xmm10
+ vpternlogd $0x96,%xmm5,%xmm6,%xmm10
+
+
+
+
+ vpxord 0(%rdi),%zmm14,%zmm4
+ vpxord 64(%rdi),%zmm14,%zmm5
+ vpxord 128(%rdi),%zmm14,%zmm6
+ vpxord 192(%rdi),%zmm14,%zmm7
+
+
+
+ vaesenclast %zmm4,%zmm0,%zmm4
+ vaesenclast %zmm5,%zmm1,%zmm5
+ vaesenclast %zmm6,%zmm2,%zmm6
+ vaesenclast %zmm7,%zmm3,%zmm7
+
+
+ vmovdqu8 %zmm4,0(%rsi)
+ vmovdqu8 %zmm5,64(%rsi)
+ vmovdqu8 %zmm6,128(%rsi)
+ vmovdqu8 %zmm7,192(%rsi)
+
+ subq $-256,%rdi
+ subq $-256,%rsi
+ addq $-256,%rdx
+ cmpq $256-1,%rdx
+ ja .Lcrypt_loop_4x__func4
+.Lcrypt_loop_4x_done__func4:
+
+ testq %rdx,%rdx
+ jz .Ldone__func4
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ movq %rdx,%rax
+ negq %rax
+ andq $-16,%rax
+ leaq 256(%r9,%rax,1),%r8
+ vpxor %xmm4,%xmm4,%xmm4
+ vpxor %xmm5,%xmm5,%xmm5
+ vpxor %xmm6,%xmm6,%xmm6
+
+ cmpq $64,%rdx
+ jb .Lpartial_vec__func4
+
+.Lcrypt_loop_1x__func4:
+
+
+
+ vpshufb %zmm8,%zmm12,%zmm0
+ vpaddd %zmm11,%zmm12,%zmm12
+ vpxord %zmm13,%zmm0,%zmm0
+ leaq 16(%rcx),%rax
+.Lvaesenc_loop_tail_full_vec__func4:
+ vbroadcasti32x4 (%rax),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne .Lvaesenc_loop_tail_full_vec__func4
+ vaesenclast %zmm14,%zmm0,%zmm0
+
+
+ vmovdqu8 (%rdi),%zmm1
+ vpxord %zmm1,%zmm0,%zmm0
+ vmovdqu8 %zmm0,(%rsi)
+
+
+ vmovdqu8 (%r8),%zmm30
+ vpshufb %zmm8,%zmm1,%zmm0
+ vpxord %zmm10,%zmm0,%zmm0
+ vpclmulqdq $0x00,%zmm30,%zmm0,%zmm7
+ vpclmulqdq $0x01,%zmm30,%zmm0,%zmm1
+ vpclmulqdq $0x10,%zmm30,%zmm0,%zmm2
+ vpclmulqdq $0x11,%zmm30,%zmm0,%zmm3
+ vpxord %zmm7,%zmm4,%zmm4
+ vpternlogd $0x96,%zmm2,%zmm1,%zmm5
+ vpxord %zmm3,%zmm6,%zmm6
+
+ vpxor %xmm10,%xmm10,%xmm10
+
+ addq $64,%r8
+ addq $64,%rdi
+ addq $64,%rsi
+ subq $64,%rdx
+ cmpq $64,%rdx
+ jae .Lcrypt_loop_1x__func4
+
+ testq %rdx,%rdx
+ jz .Lreduce__func4
+
+.Lpartial_vec__func4:
+
+
+
+
+ movq $-1,%rax
+ bzhiq %rdx,%rax,%rax
+ kmovq %rax,%k1
+ addq $15,%rdx
+ andq $-16,%rdx
+ movq $-1,%rax
+ bzhiq %rdx,%rax,%rax
+ kmovq %rax,%k2
+
+
+
+ vpshufb %zmm8,%zmm12,%zmm0
+ vpxord %zmm13,%zmm0,%zmm0
+ leaq 16(%rcx),%rax
+.Lvaesenc_loop_tail_partialvec__func4:
+ vbroadcasti32x4 (%rax),%zmm9
+ vaesenc %zmm9,%zmm0,%zmm0
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne .Lvaesenc_loop_tail_partialvec__func4
+ vaesenclast %zmm14,%zmm0,%zmm0
+
+
+ vmovdqu8 (%rdi),%zmm1{%k1}{z}
+ vpxord %zmm1,%zmm0,%zmm0
+ vmovdqu8 %zmm0,(%rsi){%k1}
+
+
+
+
+
+
+
+
+
+
+
+
+
+ vmovdqu8 (%r8),%zmm30{%k2}{z}
+
+ vpshufb %zmm8,%zmm1,%zmm0
+ vpxord %zmm10,%zmm0,%zmm0
+ vpclmulqdq $0x00,%zmm30,%zmm0,%zmm7
+ vpclmulqdq $0x01,%zmm30,%zmm0,%zmm1
+ vpclmulqdq $0x10,%zmm30,%zmm0,%zmm2
+ vpclmulqdq $0x11,%zmm30,%zmm0,%zmm3
+ vpxord %zmm7,%zmm4,%zmm4
+ vpternlogd $0x96,%zmm2,%zmm1,%zmm5
+ vpxord %zmm3,%zmm6,%zmm6
+
+
+.Lreduce__func4:
+
+ vpclmulqdq $0x01,%zmm4,%zmm31,%zmm0
+ vpshufd $0x4e,%zmm4,%zmm4
+ vpternlogd $0x96,%zmm0,%zmm4,%zmm5
+ vpclmulqdq $0x01,%zmm5,%zmm31,%zmm0
+ vpshufd $0x4e,%zmm5,%zmm5
+ vpternlogd $0x96,%zmm0,%zmm5,%zmm6
+
+ vextracti32x4 $1,%zmm6,%xmm0
+ vextracti32x4 $2,%zmm6,%xmm1
+ vextracti32x4 $3,%zmm6,%xmm2
+ vpxord %xmm0,%xmm6,%xmm10
+ vpternlogd $0x96,%xmm1,%xmm2,%xmm10
+
+
+.Ldone__func4:
+
+ vpshufb %xmm8,%xmm10,%xmm10
+ vmovdqu %xmm10,(%r12)
+
+ vzeroupper
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ ret
+
+.cfi_endproc
+.size aes_gcm_dec_update_vaes_avx10_512, . - aes_gcm_dec_update_vaes_avx10_512
+#endif
diff --git a/gen/bcm/aes-gcm-avx10-x86_64-win.asm b/gen/bcm/aes-gcm-avx10-x86_64-win.asm
new file mode 100644
index 0000000..258f923
--- /dev/null
+++ b/gen/bcm/aes-gcm-avx10-x86_64-win.asm
@@ -0,0 +1,2790 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section .rdata rdata align=8
+ALIGN 64
+
+
+$L$bswap_mask:
+ DQ 0x08090a0b0c0d0e0f,0x0001020304050607
+
+
+
+
+
+
+
+
+$L$gfpoly:
+ DQ 1,0xc200000000000000
+
+
+$L$gfpoly_and_internal_carrybit:
+ DQ 1,0xc200000000000001
+
+
+
+
+
+$L$ctr_pattern:
+ DQ 0,0
+ DQ 1,0
+$L$inc_2blocks:
+ DQ 2,0
+ DQ 3,0
+$L$inc_4blocks:
+ DQ 4,0
+
+section .text code align=64
+
+global gcm_gmult_vpclmulqdq_avx10
+
+ALIGN 32
+gcm_gmult_vpclmulqdq_avx10:
+
+$L$SEH_begin_gcm_gmult_vpclmulqdq_avx10_1:
+_CET_ENDBR
+ sub rsp,24
+$L$SEH_prologue_gcm_gmult_vpclmulqdq_avx10_2:
+ movdqa XMMWORD[rsp],xmm6
+$L$SEH_prologue_gcm_gmult_vpclmulqdq_avx10_3:
+
+$L$SEH_endprologue_gcm_gmult_vpclmulqdq_avx10_4:
+
+ vmovdqu xmm0,XMMWORD[rcx]
+ vmovdqu xmm1,XMMWORD[$L$bswap_mask]
+ vmovdqu xmm2,XMMWORD[((256-16))+rdx]
+ vmovdqu xmm3,XMMWORD[$L$gfpoly]
+ vpshufb xmm0,xmm0,xmm1
+
+ vpclmulqdq xmm4,xmm0,xmm2,0x00
+ vpclmulqdq xmm5,xmm0,xmm2,0x01
+ vpclmulqdq xmm6,xmm0,xmm2,0x10
+ vpxord xmm5,xmm5,xmm6
+ vpclmulqdq xmm6,xmm3,xmm4,0x01
+ vpshufd xmm4,xmm4,0x4e
+ vpternlogd xmm5,xmm4,xmm6,0x96
+ vpclmulqdq xmm0,xmm0,xmm2,0x11
+ vpclmulqdq xmm4,xmm3,xmm5,0x01
+ vpshufd xmm5,xmm5,0x4e
+ vpternlogd xmm0,xmm5,xmm4,0x96
+
+
+ vpshufb xmm0,xmm0,xmm1
+ vmovdqu XMMWORD[rcx],xmm0
+ movdqa xmm6,XMMWORD[rsp]
+ add rsp,24
+ ret
+$L$SEH_end_gcm_gmult_vpclmulqdq_avx10_5:
+
+
+global gcm_init_vpclmulqdq_avx10
+
+ALIGN 32
+gcm_init_vpclmulqdq_avx10:
+
+
+_CET_ENDBR
+
+ lea r8,[((256-32))+rcx]
+
+
+
+ vpshufd xmm3,XMMWORD[rdx],0x4e
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ vpshufd xmm0,xmm3,0xd3
+ vpsrad xmm0,xmm0,31
+ vpaddq xmm3,xmm3,xmm3
+
+ vpternlogd xmm3,xmm0,XMMWORD[$L$gfpoly_and_internal_carrybit],0x78
+
+
+ vbroadcasti32x4 ymm5,YMMWORD[$L$gfpoly]
+
+
+
+
+
+
+
+
+ vpclmulqdq xmm0,xmm3,xmm3,0x00
+ vpclmulqdq xmm1,xmm3,xmm3,0x01
+ vpclmulqdq xmm2,xmm3,xmm3,0x10
+ vpxord xmm1,xmm1,xmm2
+ vpclmulqdq xmm2,xmm5,xmm0,0x01
+ vpshufd xmm0,xmm0,0x4e
+ vpternlogd xmm1,xmm0,xmm2,0x96
+ vpclmulqdq xmm4,xmm3,xmm3,0x11
+ vpclmulqdq xmm0,xmm5,xmm1,0x01
+ vpshufd xmm1,xmm1,0x4e
+ vpternlogd xmm4,xmm1,xmm0,0x96
+
+
+
+ vinserti128 ymm3,ymm4,xmm3,1
+ vinserti128 ymm4,ymm4,xmm4,1
+
+ vmovdqu8 YMMWORD[r8],ymm3
+
+
+
+
+
+ mov eax,7
+$L$precompute_next__func1:
+ sub r8,32
+ vpclmulqdq ymm0,ymm3,ymm4,0x00
+ vpclmulqdq ymm1,ymm3,ymm4,0x01
+ vpclmulqdq ymm2,ymm3,ymm4,0x10
+ vpxord ymm1,ymm1,ymm2
+ vpclmulqdq ymm2,ymm5,ymm0,0x01
+ vpshufd ymm0,ymm0,0x4e
+ vpternlogd ymm1,ymm0,ymm2,0x96
+ vpclmulqdq ymm3,ymm3,ymm4,0x11
+ vpclmulqdq ymm0,ymm5,ymm1,0x01
+ vpshufd ymm1,ymm1,0x4e
+ vpternlogd ymm3,ymm1,ymm0,0x96
+
+ vmovdqu8 YMMWORD[r8],ymm3
+ dec eax
+ jnz NEAR $L$precompute_next__func1
+
+ vzeroupper
+ ret
+
+
+
+global gcm_ghash_vpclmulqdq_avx10_256
+
+ALIGN 32
+gcm_ghash_vpclmulqdq_avx10_256:
+
+$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1:
+_CET_ENDBR
+ sub rsp,136
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_2:
+ movdqa XMMWORD[rsp],xmm6
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_3:
+ movdqa XMMWORD[16+rsp],xmm7
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_4:
+ movdqa XMMWORD[32+rsp],xmm8
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_5:
+ movdqa XMMWORD[48+rsp],xmm9
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_6:
+ movdqa XMMWORD[64+rsp],xmm10
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_7:
+ movdqa XMMWORD[80+rsp],xmm11
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_8:
+ movdqa XMMWORD[96+rsp],xmm12
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_9:
+ movdqa XMMWORD[112+rsp],xmm13
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_10:
+
+$L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx10_256_11:
+
+
+
+
+ vmovdqu xmm4,XMMWORD[$L$bswap_mask]
+ vmovdqu xmm10,XMMWORD[$L$gfpoly]
+
+
+ vmovdqu xmm5,XMMWORD[rcx]
+ vpshufb xmm5,xmm5,xmm4
+
+
+ cmp r9,32
+ jb NEAR $L$aad_blockbyblock__func1
+
+
+
+ vshufi64x2 ymm4,ymm4,ymm4,0
+ vshufi64x2 ymm10,ymm10,ymm10,0
+
+
+ vmovdqu8 ymm9,YMMWORD[((256-32))+rdx]
+
+ cmp r9,4*32-1
+ jbe NEAR $L$aad_loop_1x__func1
+
+
+ vmovdqu8 ymm6,YMMWORD[((256-128))+rdx]
+ vmovdqu8 ymm7,YMMWORD[((256-96))+rdx]
+ vmovdqu8 ymm8,YMMWORD[((256-64))+rdx]
+
+
+$L$aad_loop_4x__func1:
+ vmovdqu8 ymm0,YMMWORD[r8]
+ vmovdqu8 ymm1,YMMWORD[32+r8]
+ vmovdqu8 ymm2,YMMWORD[64+r8]
+ vmovdqu8 ymm3,YMMWORD[96+r8]
+ vpshufb ymm0,ymm0,ymm4
+ vpxord ymm0,ymm0,ymm5
+ vpshufb ymm1,ymm1,ymm4
+ vpshufb ymm2,ymm2,ymm4
+ vpshufb ymm3,ymm3,ymm4
+ vpclmulqdq ymm5,ymm0,ymm6,0x00
+ vpclmulqdq ymm11,ymm1,ymm7,0x00
+ vpclmulqdq ymm12,ymm2,ymm8,0x00
+ vpxord ymm5,ymm5,ymm11
+ vpclmulqdq ymm13,ymm3,ymm9,0x00
+ vpternlogd ymm5,ymm12,ymm13,0x96
+ vpclmulqdq ymm11,ymm0,ymm6,0x01
+ vpclmulqdq ymm12,ymm1,ymm7,0x01
+ vpclmulqdq ymm13,ymm2,ymm8,0x01
+ vpternlogd ymm11,ymm12,ymm13,0x96
+ vpclmulqdq ymm12,ymm3,ymm9,0x01
+ vpclmulqdq ymm13,ymm0,ymm6,0x10
+ vpternlogd ymm11,ymm12,ymm13,0x96
+ vpclmulqdq ymm12,ymm1,ymm7,0x10
+ vpclmulqdq ymm13,ymm2,ymm8,0x10
+ vpternlogd ymm11,ymm12,ymm13,0x96
+ vpclmulqdq ymm13,ymm10,ymm5,0x01
+ vpclmulqdq ymm12,ymm3,ymm9,0x10
+ vpxord ymm11,ymm11,ymm12
+ vpshufd ymm5,ymm5,0x4e
+ vpclmulqdq ymm0,ymm0,ymm6,0x11
+ vpclmulqdq ymm1,ymm1,ymm7,0x11
+ vpclmulqdq ymm2,ymm2,ymm8,0x11
+ vpternlogd ymm11,ymm5,ymm13,0x96
+ vpclmulqdq ymm3,ymm3,ymm9,0x11
+ vpternlogd ymm0,ymm1,ymm2,0x96
+ vpclmulqdq ymm12,ymm10,ymm11,0x01
+ vpxord ymm5,ymm0,ymm3
+ vpshufd ymm11,ymm11,0x4e
+ vpternlogd ymm5,ymm11,ymm12,0x96
+ vextracti32x4 xmm0,ymm5,1
+ vpxord xmm5,xmm5,xmm0
+
+ sub r8,-4*32
+ add r9,-4*32
+ cmp r9,4*32-1
+ ja NEAR $L$aad_loop_4x__func1
+
+
+ cmp r9,32
+ jb NEAR $L$aad_large_done__func1
+$L$aad_loop_1x__func1:
+ vmovdqu8 ymm0,YMMWORD[r8]
+ vpshufb ymm0,ymm0,ymm4
+ vpxord ymm5,ymm5,ymm0
+ vpclmulqdq ymm0,ymm5,ymm9,0x00
+ vpclmulqdq ymm1,ymm5,ymm9,0x01
+ vpclmulqdq ymm2,ymm5,ymm9,0x10
+ vpxord ymm1,ymm1,ymm2
+ vpclmulqdq ymm2,ymm10,ymm0,0x01
+ vpshufd ymm0,ymm0,0x4e
+ vpternlogd ymm1,ymm0,ymm2,0x96
+ vpclmulqdq ymm5,ymm5,ymm9,0x11
+ vpclmulqdq ymm0,ymm10,ymm1,0x01
+ vpshufd ymm1,ymm1,0x4e
+ vpternlogd ymm5,ymm1,ymm0,0x96
+
+ vextracti32x4 xmm0,ymm5,1
+ vpxord xmm5,xmm5,xmm0
+
+ add r8,32
+ sub r9,32
+ cmp r9,32
+ jae NEAR $L$aad_loop_1x__func1
+
+$L$aad_large_done__func1:
+
+
+ vzeroupper
+
+
+$L$aad_blockbyblock__func1:
+ test r9,r9
+ jz NEAR $L$aad_done__func1
+ vmovdqu xmm9,XMMWORD[((256-16))+rdx]
+$L$aad_loop_blockbyblock__func1:
+ vmovdqu xmm0,XMMWORD[r8]
+ vpshufb xmm0,xmm0,xmm4
+ vpxor xmm5,xmm5,xmm0
+ vpclmulqdq xmm0,xmm5,xmm9,0x00
+ vpclmulqdq xmm1,xmm5,xmm9,0x01
+ vpclmulqdq xmm2,xmm5,xmm9,0x10
+ vpxord xmm1,xmm1,xmm2
+ vpclmulqdq xmm2,xmm10,xmm0,0x01
+ vpshufd xmm0,xmm0,0x4e
+ vpternlogd xmm1,xmm0,xmm2,0x96
+ vpclmulqdq xmm5,xmm5,xmm9,0x11
+ vpclmulqdq xmm0,xmm10,xmm1,0x01
+ vpshufd xmm1,xmm1,0x4e
+ vpternlogd xmm5,xmm1,xmm0,0x96
+
+ add r8,16
+ sub r9,16
+ jnz NEAR $L$aad_loop_blockbyblock__func1
+
+$L$aad_done__func1:
+
+ vpshufb xmm5,xmm5,xmm4
+ vmovdqu XMMWORD[rcx],xmm5
+ movdqa xmm6,XMMWORD[rsp]
+ movdqa xmm7,XMMWORD[16+rsp]
+ movdqa xmm8,XMMWORD[32+rsp]
+ movdqa xmm9,XMMWORD[48+rsp]
+ movdqa xmm10,XMMWORD[64+rsp]
+ movdqa xmm11,XMMWORD[80+rsp]
+ movdqa xmm12,XMMWORD[96+rsp]
+ movdqa xmm13,XMMWORD[112+rsp]
+ add rsp,136
+ ret
+$L$SEH_end_gcm_ghash_vpclmulqdq_avx10_256_12:
+
+
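+; aes_gcm_enc_update_vaes_avx10_256: CTR-encrypts r8 bytes from [rcx] to
+; [rdx] with the AES round keys at [r9] (the number of rounds is derived
+; from the field at offset 240), using 256-bit vectors of two AES blocks.
+; The three stack arguments loaded below supply the initial big-endian
+; counter block, the GHASH key power table, and the GHASH accumulator.  The
+; main loop encrypts four vectors per iteration while hashing the ciphertext
+; produced by the previous iteration, interleaving VPCLMULQDQ work with the
+; AES rounds; partial final vectors are handled with masked loads and stores.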
+global aes_gcm_enc_update_vaes_avx10_256
+
+ALIGN 32
+aes_gcm_enc_update_vaes_avx10_256:
+
+$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1:
+_CET_ENDBR
+ push rsi
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_2:
+ push rdi
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_3:
+ push r12
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_4:
+
+ mov rsi,QWORD[64+rsp]
+ mov rdi,QWORD[72+rsp]
+ mov r12,QWORD[80+rsp]
+ sub rsp,160
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_5:
+ movdqa XMMWORD[rsp],xmm6
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_6:
+ movdqa XMMWORD[16+rsp],xmm7
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_7:
+ movdqa XMMWORD[32+rsp],xmm8
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_8:
+ movdqa XMMWORD[48+rsp],xmm9
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_9:
+ movdqa XMMWORD[64+rsp],xmm10
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_10:
+ movdqa XMMWORD[80+rsp],xmm11
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_11:
+ movdqa XMMWORD[96+rsp],xmm12
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_12:
+ movdqa XMMWORD[112+rsp],xmm13
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_13:
+ movdqa XMMWORD[128+rsp],xmm14
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_14:
+ movdqa XMMWORD[144+rsp],xmm15
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_15:
+
+$L$SEH_endprologue_aes_gcm_enc_update_vaes_avx10_256_16:
+%ifdef BORINGSSL_DISPATCH_TEST
+EXTERN BORINGSSL_function_hit
+ mov BYTE[((BORINGSSL_function_hit+6))],1
+%endif
+
+ vbroadcasti32x4 ymm8,YMMWORD[$L$bswap_mask]
+ vbroadcasti32x4 ymm31,YMMWORD[$L$gfpoly]
+
+
+
+ vmovdqu xmm10,XMMWORD[r12]
+ vpshufb xmm10,xmm10,xmm8
+ vbroadcasti32x4 ymm12,YMMWORD[rsi]
+ vpshufb ymm12,ymm12,ymm8
+
+
+
+ mov r10d,DWORD[240+r9]
+ lea r10d,[((-20))+r10*4]
+
+
+
+
+ lea r11,[96+r10*4+r9]
+ vbroadcasti32x4 ymm13,YMMWORD[r9]
+ vbroadcasti32x4 ymm14,YMMWORD[r11]
+
+
+ vpaddd ymm12,ymm12,YMMWORD[$L$ctr_pattern]
+
+
+ vbroadcasti32x4 ymm11,YMMWORD[$L$inc_2blocks]
+
+
+
+ cmp r8,4*32-1
+ jbe NEAR $L$crypt_loop_4x_done__func1
+
+
+ vmovdqu8 ymm27,YMMWORD[((256-128))+rdi]
+ vmovdqu8 ymm28,YMMWORD[((256-96))+rdi]
+ vmovdqu8 ymm29,YMMWORD[((256-64))+rdi]
+ vmovdqu8 ymm30,YMMWORD[((256-32))+rdi]
+
+
+
+
+ vpshufb ymm0,ymm12,ymm8
+ vpaddd ymm12,ymm12,ymm11
+ vpshufb ymm1,ymm12,ymm8
+ vpaddd ymm12,ymm12,ymm11
+ vpshufb ymm2,ymm12,ymm8
+ vpaddd ymm12,ymm12,ymm11
+ vpshufb ymm3,ymm12,ymm8
+ vpaddd ymm12,ymm12,ymm11
+
+
+ vpxord ymm0,ymm0,ymm13
+ vpxord ymm1,ymm1,ymm13
+ vpxord ymm2,ymm2,ymm13
+ vpxord ymm3,ymm3,ymm13
+
+ lea rax,[16+r9]
+$L$vaesenc_loop_first_4_vecs__func1:
+ vbroadcasti32x4 ymm9,YMMWORD[rax]
+ vaesenc ymm0,ymm0,ymm9
+ vaesenc ymm1,ymm1,ymm9
+ vaesenc ymm2,ymm2,ymm9
+ vaesenc ymm3,ymm3,ymm9
+
+ add rax,16
+ cmp r11,rax
+ jne NEAR $L$vaesenc_loop_first_4_vecs__func1
+
+
+
+ vpxord ymm4,ymm14,YMMWORD[rcx]
+ vpxord ymm5,ymm14,YMMWORD[32+rcx]
+ vpxord ymm6,ymm14,YMMWORD[64+rcx]
+ vpxord ymm7,ymm14,YMMWORD[96+rcx]
+
+
+
+ vaesenclast ymm4,ymm0,ymm4
+ vaesenclast ymm5,ymm1,ymm5
+ vaesenclast ymm6,ymm2,ymm6
+ vaesenclast ymm7,ymm3,ymm7
+
+
+ vmovdqu8 YMMWORD[rdx],ymm4
+ vmovdqu8 YMMWORD[32+rdx],ymm5
+ vmovdqu8 YMMWORD[64+rdx],ymm6
+ vmovdqu8 YMMWORD[96+rdx],ymm7
+
+ sub rcx,-4*32
+ sub rdx,-4*32
+ add r8,-4*32
+ cmp r8,4*32-1
+ jbe NEAR $L$ghash_last_ciphertext_4x__func1
+ vbroadcasti32x4 ymm15,YMMWORD[((-144))+r11]
+ vbroadcasti32x4 ymm16,YMMWORD[((-128))+r11]
+ vbroadcasti32x4 ymm17,YMMWORD[((-112))+r11]
+ vbroadcasti32x4 ymm18,YMMWORD[((-96))+r11]
+ vbroadcasti32x4 ymm19,YMMWORD[((-80))+r11]
+ vbroadcasti32x4 ymm20,YMMWORD[((-64))+r11]
+ vbroadcasti32x4 ymm21,YMMWORD[((-48))+r11]
+ vbroadcasti32x4 ymm22,YMMWORD[((-32))+r11]
+ vbroadcasti32x4 ymm23,YMMWORD[((-16))+r11]
+$L$crypt_loop_4x__func1:
+
+
+
+ vpshufb ymm0,ymm12,ymm8
+ vpaddd ymm12,ymm12,ymm11
+ vpshufb ymm1,ymm12,ymm8
+ vpaddd ymm12,ymm12,ymm11
+ vpshufb ymm2,ymm12,ymm8
+ vpaddd ymm12,ymm12,ymm11
+ vpshufb ymm3,ymm12,ymm8
+ vpaddd ymm12,ymm12,ymm11
+
+
+ vpxord ymm0,ymm0,ymm13
+ vpxord ymm1,ymm1,ymm13
+ vpxord ymm2,ymm2,ymm13
+ vpxord ymm3,ymm3,ymm13
+
+ cmp r10d,24
+ jl NEAR $L$aes128__func1
+ je NEAR $L$aes192__func1
+
+ vbroadcasti32x4 ymm9,YMMWORD[((-208))+r11]
+ vaesenc ymm0,ymm0,ymm9
+ vaesenc ymm1,ymm1,ymm9
+ vaesenc ymm2,ymm2,ymm9
+ vaesenc ymm3,ymm3,ymm9
+
+ vbroadcasti32x4 ymm9,YMMWORD[((-192))+r11]
+ vaesenc ymm0,ymm0,ymm9
+ vaesenc ymm1,ymm1,ymm9
+ vaesenc ymm2,ymm2,ymm9
+ vaesenc ymm3,ymm3,ymm9
+
+$L$aes192__func1:
+ vbroadcasti32x4 ymm9,YMMWORD[((-176))+r11]
+ vaesenc ymm0,ymm0,ymm9
+ vaesenc ymm1,ymm1,ymm9
+ vaesenc ymm2,ymm2,ymm9
+ vaesenc ymm3,ymm3,ymm9
+
+ vbroadcasti32x4 ymm9,YMMWORD[((-160))+r11]
+ vaesenc ymm0,ymm0,ymm9
+ vaesenc ymm1,ymm1,ymm9
+ vaesenc ymm2,ymm2,ymm9
+ vaesenc ymm3,ymm3,ymm9
+
+$L$aes128__func1:
+ vpshufb ymm4,ymm4,ymm8
+ vpxord ymm4,ymm4,ymm10
+ vpshufb ymm5,ymm5,ymm8
+ vpshufb ymm6,ymm6,ymm8
+
+ vaesenc ymm0,ymm0,ymm15
+ vaesenc ymm1,ymm1,ymm15
+ vaesenc ymm2,ymm2,ymm15
+ vaesenc ymm3,ymm3,ymm15
+
+ vpshufb ymm7,ymm7,ymm8
+ vpclmulqdq ymm10,ymm4,ymm27,0x00
+ vpclmulqdq ymm24,ymm5,ymm28,0x00
+ vpclmulqdq ymm25,ymm6,ymm29,0x00
+
+ vaesenc ymm0,ymm0,ymm16
+ vaesenc ymm1,ymm1,ymm16
+ vaesenc ymm2,ymm2,ymm16
+ vaesenc ymm3,ymm3,ymm16
+
+ vpxord ymm10,ymm10,ymm24
+ vpclmulqdq ymm26,ymm7,ymm30,0x00
+ vpternlogd ymm10,ymm25,ymm26,0x96
+ vpclmulqdq ymm24,ymm4,ymm27,0x01
+
+ vaesenc ymm0,ymm0,ymm17
+ vaesenc ymm1,ymm1,ymm17
+ vaesenc ymm2,ymm2,ymm17
+ vaesenc ymm3,ymm3,ymm17
+
+ vpclmulqdq ymm25,ymm5,ymm28,0x01
+ vpclmulqdq ymm26,ymm6,ymm29,0x01
+ vpternlogd ymm24,ymm25,ymm26,0x96
+ vpclmulqdq ymm25,ymm7,ymm30,0x01
+
+ vaesenc ymm0,ymm0,ymm18
+ vaesenc ymm1,ymm1,ymm18
+ vaesenc ymm2,ymm2,ymm18
+ vaesenc ymm3,ymm3,ymm18
+
+ vpclmulqdq ymm26,ymm4,ymm27,0x10
+ vpternlogd ymm24,ymm25,ymm26,0x96
+ vpclmulqdq ymm25,ymm5,ymm28,0x10
+ vpclmulqdq ymm26,ymm6,ymm29,0x10
+
+ vaesenc ymm0,ymm0,ymm19
+ vaesenc ymm1,ymm1,ymm19
+ vaesenc ymm2,ymm2,ymm19
+ vaesenc ymm3,ymm3,ymm19
+
+ vpternlogd ymm24,ymm25,ymm26,0x96
+ vpclmulqdq ymm26,ymm31,ymm10,0x01
+ vpclmulqdq ymm25,ymm7,ymm30,0x10
+ vpxord ymm24,ymm24,ymm25
+
+ vaesenc ymm0,ymm0,ymm20
+ vaesenc ymm1,ymm1,ymm20
+ vaesenc ymm2,ymm2,ymm20
+ vaesenc ymm3,ymm3,ymm20
+
+ vpshufd ymm10,ymm10,0x4e
+ vpclmulqdq ymm4,ymm4,ymm27,0x11
+ vpclmulqdq ymm5,ymm5,ymm28,0x11
+ vpclmulqdq ymm6,ymm6,ymm29,0x11
+
+ vaesenc ymm0,ymm0,ymm21
+ vaesenc ymm1,ymm1,ymm21
+ vaesenc ymm2,ymm2,ymm21
+ vaesenc ymm3,ymm3,ymm21
+
+ vpternlogd ymm24,ymm10,ymm26,0x96
+ vpclmulqdq ymm7,ymm7,ymm30,0x11
+ vpternlogd ymm4,ymm5,ymm6,0x96
+ vpclmulqdq ymm25,ymm31,ymm24,0x01
+
+ vaesenc ymm0,ymm0,ymm22
+ vaesenc ymm1,ymm1,ymm22
+ vaesenc ymm2,ymm2,ymm22
+ vaesenc ymm3,ymm3,ymm22
+
+ vpxord ymm10,ymm4,ymm7
+ vpshufd ymm24,ymm24,0x4e
+ vpternlogd ymm10,ymm24,ymm25,0x96
+
+ vaesenc ymm0,ymm0,ymm23
+ vaesenc ymm1,ymm1,ymm23
+ vaesenc ymm2,ymm2,ymm23
+ vaesenc ymm3,ymm3,ymm23
+
+ vextracti32x4 xmm4,ymm10,1
+ vpxord xmm10,xmm10,xmm4
+
+
+
+
+ vpxord ymm4,ymm14,YMMWORD[rcx]
+ vpxord ymm5,ymm14,YMMWORD[32+rcx]
+ vpxord ymm6,ymm14,YMMWORD[64+rcx]
+ vpxord ymm7,ymm14,YMMWORD[96+rcx]
+
+
+
+ vaesenclast ymm4,ymm0,ymm4
+ vaesenclast ymm5,ymm1,ymm5
+ vaesenclast ymm6,ymm2,ymm6
+ vaesenclast ymm7,ymm3,ymm7
+
+
+ vmovdqu8 YMMWORD[rdx],ymm4
+ vmovdqu8 YMMWORD[32+rdx],ymm5
+ vmovdqu8 YMMWORD[64+rdx],ymm6
+ vmovdqu8 YMMWORD[96+rdx],ymm7
+
+ sub rcx,-4*32
+ sub rdx,-4*32
+ add r8,-4*32
+ cmp r8,4*32-1
+ ja NEAR $L$crypt_loop_4x__func1
+$L$ghash_last_ciphertext_4x__func1:
+ vpshufb ymm4,ymm4,ymm8
+ vpxord ymm4,ymm4,ymm10
+ vpshufb ymm5,ymm5,ymm8
+ vpshufb ymm6,ymm6,ymm8
+ vpshufb ymm7,ymm7,ymm8
+ vpclmulqdq ymm10,ymm4,ymm27,0x00
+ vpclmulqdq ymm24,ymm5,ymm28,0x00
+ vpclmulqdq ymm25,ymm6,ymm29,0x00
+ vpxord ymm10,ymm10,ymm24
+ vpclmulqdq ymm26,ymm7,ymm30,0x00
+ vpternlogd ymm10,ymm25,ymm26,0x96
+ vpclmulqdq ymm24,ymm4,ymm27,0x01
+ vpclmulqdq ymm25,ymm5,ymm28,0x01
+ vpclmulqdq ymm26,ymm6,ymm29,0x01
+ vpternlogd ymm24,ymm25,ymm26,0x96
+ vpclmulqdq ymm25,ymm7,ymm30,0x01
+ vpclmulqdq ymm26,ymm4,ymm27,0x10
+ vpternlogd ymm24,ymm25,ymm26,0x96
+ vpclmulqdq ymm25,ymm5,ymm28,0x10
+ vpclmulqdq ymm26,ymm6,ymm29,0x10
+ vpternlogd ymm24,ymm25,ymm26,0x96
+ vpclmulqdq ymm26,ymm31,ymm10,0x01
+ vpclmulqdq ymm25,ymm7,ymm30,0x10
+ vpxord ymm24,ymm24,ymm25
+ vpshufd ymm10,ymm10,0x4e
+ vpclmulqdq ymm4,ymm4,ymm27,0x11
+ vpclmulqdq ymm5,ymm5,ymm28,0x11
+ vpclmulqdq ymm6,ymm6,ymm29,0x11
+ vpternlogd ymm24,ymm10,ymm26,0x96
+ vpclmulqdq ymm7,ymm7,ymm30,0x11
+ vpternlogd ymm4,ymm5,ymm6,0x96
+ vpclmulqdq ymm25,ymm31,ymm24,0x01
+ vpxord ymm10,ymm4,ymm7
+ vpshufd ymm24,ymm24,0x4e
+ vpternlogd ymm10,ymm24,ymm25,0x96
+ vextracti32x4 xmm4,ymm10,1
+ vpxord xmm10,xmm10,xmm4
+
+$L$crypt_loop_4x_done__func1:
+
+ test r8,r8
+ jz NEAR $L$done__func1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ mov rax,r8
+ neg rax
+ and rax,-16
+ lea rsi,[256+rax*1+rdi]
+ vpxor xmm4,xmm4,xmm4
+ vpxor xmm5,xmm5,xmm5
+ vpxor xmm6,xmm6,xmm6
+
+ cmp r8,32
+ jb NEAR $L$partial_vec__func1
+
+$L$crypt_loop_1x__func1:
+
+
+
+ vpshufb ymm0,ymm12,ymm8
+ vpaddd ymm12,ymm12,ymm11
+ vpxord ymm0,ymm0,ymm13
+ lea rax,[16+r9]
+$L$vaesenc_loop_tail_full_vec__func1:
+ vbroadcasti32x4 ymm9,YMMWORD[rax]
+ vaesenc ymm0,ymm0,ymm9
+ add rax,16
+ cmp r11,rax
+ jne NEAR $L$vaesenc_loop_tail_full_vec__func1
+ vaesenclast ymm0,ymm0,ymm14
+
+
+ vmovdqu8 ymm1,YMMWORD[rcx]
+ vpxord ymm0,ymm0,ymm1
+ vmovdqu8 YMMWORD[rdx],ymm0
+
+
+ vmovdqu8 ymm30,YMMWORD[rsi]
+ vpshufb ymm0,ymm0,ymm8
+ vpxord ymm0,ymm0,ymm10
+ vpclmulqdq ymm7,ymm0,ymm30,0x00
+ vpclmulqdq ymm1,ymm0,ymm30,0x01
+ vpclmulqdq ymm2,ymm0,ymm30,0x10
+ vpclmulqdq ymm3,ymm0,ymm30,0x11
+ vpxord ymm4,ymm4,ymm7
+ vpternlogd ymm5,ymm1,ymm2,0x96
+ vpxord ymm6,ymm6,ymm3
+
+ vpxor xmm10,xmm10,xmm10
+
+ add rsi,32
+ add rcx,32
+ add rdx,32
+ sub r8,32
+ cmp r8,32
+ jae NEAR $L$crypt_loop_1x__func1
+
+ test r8,r8
+ jz NEAR $L$reduce__func1
+
+$L$partial_vec__func1:
+
+
+
+
+ mov rax,-1
+ bzhi rax,rax,r8
+ kmovd k1,eax
+ add r8,15
+ and r8,-16
+ mov rax,-1
+ bzhi rax,rax,r8
+ kmovd k2,eax
+
+
+
+ vpshufb ymm0,ymm12,ymm8
+ vpxord ymm0,ymm0,ymm13
+ lea rax,[16+r9]
+$L$vaesenc_loop_tail_partialvec__func1:
+ vbroadcasti32x4 ymm9,YMMWORD[rax]
+ vaesenc ymm0,ymm0,ymm9
+ add rax,16
+ cmp r11,rax
+ jne NEAR $L$vaesenc_loop_tail_partialvec__func1
+ vaesenclast ymm0,ymm0,ymm14
+
+
+ vmovdqu8 ymm1{k1}{z},[rcx]
+ vpxord ymm0,ymm0,ymm1
+ vmovdqu8 YMMWORD[rdx]{k1},ymm0
+
+
+
+
+
+
+
+
+
+
+
+
+
+ vmovdqu8 ymm30{k2}{z},[rsi]
+ vmovdqu8 ymm1{k1}{z},ymm0
+ vpshufb ymm0,ymm1,ymm8
+ vpxord ymm0,ymm0,ymm10
+ vpclmulqdq ymm7,ymm0,ymm30,0x00
+ vpclmulqdq ymm1,ymm0,ymm30,0x01
+ vpclmulqdq ymm2,ymm0,ymm30,0x10
+ vpclmulqdq ymm3,ymm0,ymm30,0x11
+ vpxord ymm4,ymm4,ymm7
+ vpternlogd ymm5,ymm1,ymm2,0x96
+ vpxord ymm6,ymm6,ymm3
+
+
+$L$reduce__func1:
+
+ vpclmulqdq ymm0,ymm31,ymm4,0x01
+ vpshufd ymm4,ymm4,0x4e
+ vpternlogd ymm5,ymm4,ymm0,0x96
+ vpclmulqdq ymm0,ymm31,ymm5,0x01
+ vpshufd ymm5,ymm5,0x4e
+ vpternlogd ymm6,ymm5,ymm0,0x96
+
+ vextracti32x4 xmm0,ymm6,1
+ vpxord xmm10,xmm6,xmm0
+
+
+$L$done__func1:
+
+ vpshufb xmm10,xmm10,xmm8
+ vmovdqu XMMWORD[r12],xmm10
+
+ vzeroupper
+ movdqa xmm6,XMMWORD[rsp]
+ movdqa xmm7,XMMWORD[16+rsp]
+ movdqa xmm8,XMMWORD[32+rsp]
+ movdqa xmm9,XMMWORD[48+rsp]
+ movdqa xmm10,XMMWORD[64+rsp]
+ movdqa xmm11,XMMWORD[80+rsp]
+ movdqa xmm12,XMMWORD[96+rsp]
+ movdqa xmm13,XMMWORD[112+rsp]
+ movdqa xmm14,XMMWORD[128+rsp]
+ movdqa xmm15,XMMWORD[144+rsp]
+ add rsp,160
+ pop r12
+ pop rdi
+ pop rsi
+ ret
+$L$SEH_end_aes_gcm_enc_update_vaes_avx10_256_17:
+
+
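+; aes_gcm_dec_update_vaes_avx10_256: same interface and structure as the
+; encryption routine above, except that GHASH is computed over the incoming
+; ciphertext, so each main-loop iteration hashes the vectors it is currently
+; decrypting and no separate pass over the final batch is needed.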
+global aes_gcm_dec_update_vaes_avx10_256
+
+ALIGN 32
+aes_gcm_dec_update_vaes_avx10_256:
+
+$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1:
+_CET_ENDBR
+ push rsi
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_2:
+ push rdi
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_3:
+ push r12
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_4:
+
+ mov rsi,QWORD[64+rsp]
+ mov rdi,QWORD[72+rsp]
+ mov r12,QWORD[80+rsp]
+ sub rsp,160
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_5:
+ movdqa XMMWORD[rsp],xmm6
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_6:
+ movdqa XMMWORD[16+rsp],xmm7
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_7:
+ movdqa XMMWORD[32+rsp],xmm8
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_8:
+ movdqa XMMWORD[48+rsp],xmm9
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_9:
+ movdqa XMMWORD[64+rsp],xmm10
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_10:
+ movdqa XMMWORD[80+rsp],xmm11
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_11:
+ movdqa XMMWORD[96+rsp],xmm12
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_12:
+ movdqa XMMWORD[112+rsp],xmm13
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_13:
+ movdqa XMMWORD[128+rsp],xmm14
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_14:
+ movdqa XMMWORD[144+rsp],xmm15
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_15:
+
+$L$SEH_endprologue_aes_gcm_dec_update_vaes_avx10_256_16:
+
+ vbroadcasti32x4 ymm8,YMMWORD[$L$bswap_mask]
+ vbroadcasti32x4 ymm31,YMMWORD[$L$gfpoly]
+
+
+
+ vmovdqu xmm10,XMMWORD[r12]
+ vpshufb xmm10,xmm10,xmm8
+ vbroadcasti32x4 ymm12,YMMWORD[rsi]
+ vpshufb ymm12,ymm12,ymm8
+
+
+
+ mov r10d,DWORD[240+r9]
+ lea r10d,[((-20))+r10*4]
+
+
+
+
+ lea r11,[96+r10*4+r9]
+ vbroadcasti32x4 ymm13,YMMWORD[r9]
+ vbroadcasti32x4 ymm14,YMMWORD[r11]
+
+
+ vpaddd ymm12,ymm12,YMMWORD[$L$ctr_pattern]
+
+
+ vbroadcasti32x4 ymm11,YMMWORD[$L$inc_2blocks]
+
+
+
+ cmp r8,4*32-1
+ jbe NEAR $L$crypt_loop_4x_done__func2
+
+
+ vmovdqu8 ymm27,YMMWORD[((256-128))+rdi]
+ vmovdqu8 ymm28,YMMWORD[((256-96))+rdi]
+ vmovdqu8 ymm29,YMMWORD[((256-64))+rdi]
+ vmovdqu8 ymm30,YMMWORD[((256-32))+rdi]
+ vbroadcasti32x4 ymm15,YMMWORD[((-144))+r11]
+ vbroadcasti32x4 ymm16,YMMWORD[((-128))+r11]
+ vbroadcasti32x4 ymm17,YMMWORD[((-112))+r11]
+ vbroadcasti32x4 ymm18,YMMWORD[((-96))+r11]
+ vbroadcasti32x4 ymm19,YMMWORD[((-80))+r11]
+ vbroadcasti32x4 ymm20,YMMWORD[((-64))+r11]
+ vbroadcasti32x4 ymm21,YMMWORD[((-48))+r11]
+ vbroadcasti32x4 ymm22,YMMWORD[((-32))+r11]
+ vbroadcasti32x4 ymm23,YMMWORD[((-16))+r11]
+$L$crypt_loop_4x__func2:
+ vmovdqu8 ymm4,YMMWORD[rcx]
+ vmovdqu8 ymm5,YMMWORD[32+rcx]
+ vmovdqu8 ymm6,YMMWORD[64+rcx]
+ vmovdqu8 ymm7,YMMWORD[96+rcx]
+
+
+
+ vpshufb ymm0,ymm12,ymm8
+ vpaddd ymm12,ymm12,ymm11
+ vpshufb ymm1,ymm12,ymm8
+ vpaddd ymm12,ymm12,ymm11
+ vpshufb ymm2,ymm12,ymm8
+ vpaddd ymm12,ymm12,ymm11
+ vpshufb ymm3,ymm12,ymm8
+ vpaddd ymm12,ymm12,ymm11
+
+
+ vpxord ymm0,ymm0,ymm13
+ vpxord ymm1,ymm1,ymm13
+ vpxord ymm2,ymm2,ymm13
+ vpxord ymm3,ymm3,ymm13
+
+ cmp r10d,24
+ jl NEAR $L$aes128__func2
+ je NEAR $L$aes192__func2
+
+ vbroadcasti32x4 ymm9,YMMWORD[((-208))+r11]
+ vaesenc ymm0,ymm0,ymm9
+ vaesenc ymm1,ymm1,ymm9
+ vaesenc ymm2,ymm2,ymm9
+ vaesenc ymm3,ymm3,ymm9
+
+ vbroadcasti32x4 ymm9,YMMWORD[((-192))+r11]
+ vaesenc ymm0,ymm0,ymm9
+ vaesenc ymm1,ymm1,ymm9
+ vaesenc ymm2,ymm2,ymm9
+ vaesenc ymm3,ymm3,ymm9
+
+$L$aes192__func2:
+ vbroadcasti32x4 ymm9,YMMWORD[((-176))+r11]
+ vaesenc ymm0,ymm0,ymm9
+ vaesenc ymm1,ymm1,ymm9
+ vaesenc ymm2,ymm2,ymm9
+ vaesenc ymm3,ymm3,ymm9
+
+ vbroadcasti32x4 ymm9,YMMWORD[((-160))+r11]
+ vaesenc ymm0,ymm0,ymm9
+ vaesenc ymm1,ymm1,ymm9
+ vaesenc ymm2,ymm2,ymm9
+ vaesenc ymm3,ymm3,ymm9
+
+$L$aes128__func2:
+ vpshufb ymm4,ymm4,ymm8
+ vpxord ymm4,ymm4,ymm10
+ vpshufb ymm5,ymm5,ymm8
+ vpshufb ymm6,ymm6,ymm8
+
+ vaesenc ymm0,ymm0,ymm15
+ vaesenc ymm1,ymm1,ymm15
+ vaesenc ymm2,ymm2,ymm15
+ vaesenc ymm3,ymm3,ymm15
+
+ vpshufb ymm7,ymm7,ymm8
+ vpclmulqdq ymm10,ymm4,ymm27,0x00
+ vpclmulqdq ymm24,ymm5,ymm28,0x00
+ vpclmulqdq ymm25,ymm6,ymm29,0x00
+
+ vaesenc ymm0,ymm0,ymm16
+ vaesenc ymm1,ymm1,ymm16
+ vaesenc ymm2,ymm2,ymm16
+ vaesenc ymm3,ymm3,ymm16
+
+ vpxord ymm10,ymm10,ymm24
+ vpclmulqdq ymm26,ymm7,ymm30,0x00
+ vpternlogd ymm10,ymm25,ymm26,0x96
+ vpclmulqdq ymm24,ymm4,ymm27,0x01
+
+ vaesenc ymm0,ymm0,ymm17
+ vaesenc ymm1,ymm1,ymm17
+ vaesenc ymm2,ymm2,ymm17
+ vaesenc ymm3,ymm3,ymm17
+
+ vpclmulqdq ymm25,ymm5,ymm28,0x01
+ vpclmulqdq ymm26,ymm6,ymm29,0x01
+ vpternlogd ymm24,ymm25,ymm26,0x96
+ vpclmulqdq ymm25,ymm7,ymm30,0x01
+
+ vaesenc ymm0,ymm0,ymm18
+ vaesenc ymm1,ymm1,ymm18
+ vaesenc ymm2,ymm2,ymm18
+ vaesenc ymm3,ymm3,ymm18
+
+ vpclmulqdq ymm26,ymm4,ymm27,0x10
+ vpternlogd ymm24,ymm25,ymm26,0x96
+ vpclmulqdq ymm25,ymm5,ymm28,0x10
+ vpclmulqdq ymm26,ymm6,ymm29,0x10
+
+ vaesenc ymm0,ymm0,ymm19
+ vaesenc ymm1,ymm1,ymm19
+ vaesenc ymm2,ymm2,ymm19
+ vaesenc ymm3,ymm3,ymm19
+
+ vpternlogd ymm24,ymm25,ymm26,0x96
+ vpclmulqdq ymm26,ymm31,ymm10,0x01
+ vpclmulqdq ymm25,ymm7,ymm30,0x10
+ vpxord ymm24,ymm24,ymm25
+
+ vaesenc ymm0,ymm0,ymm20
+ vaesenc ymm1,ymm1,ymm20
+ vaesenc ymm2,ymm2,ymm20
+ vaesenc ymm3,ymm3,ymm20
+
+ vpshufd ymm10,ymm10,0x4e
+ vpclmulqdq ymm4,ymm4,ymm27,0x11
+ vpclmulqdq ymm5,ymm5,ymm28,0x11
+ vpclmulqdq ymm6,ymm6,ymm29,0x11
+
+ vaesenc ymm0,ymm0,ymm21
+ vaesenc ymm1,ymm1,ymm21
+ vaesenc ymm2,ymm2,ymm21
+ vaesenc ymm3,ymm3,ymm21
+
+ vpternlogd ymm24,ymm10,ymm26,0x96
+ vpclmulqdq ymm7,ymm7,ymm30,0x11
+ vpternlogd ymm4,ymm5,ymm6,0x96
+ vpclmulqdq ymm25,ymm31,ymm24,0x01
+
+ vaesenc ymm0,ymm0,ymm22
+ vaesenc ymm1,ymm1,ymm22
+ vaesenc ymm2,ymm2,ymm22
+ vaesenc ymm3,ymm3,ymm22
+
+ vpxord ymm10,ymm4,ymm7
+ vpshufd ymm24,ymm24,0x4e
+ vpternlogd ymm10,ymm24,ymm25,0x96
+
+ vaesenc ymm0,ymm0,ymm23
+ vaesenc ymm1,ymm1,ymm23
+ vaesenc ymm2,ymm2,ymm23
+ vaesenc ymm3,ymm3,ymm23
+
+ vextracti32x4 xmm4,ymm10,1
+ vpxord xmm10,xmm10,xmm4
+
+
+
+
+ vpxord ymm4,ymm14,YMMWORD[rcx]
+ vpxord ymm5,ymm14,YMMWORD[32+rcx]
+ vpxord ymm6,ymm14,YMMWORD[64+rcx]
+ vpxord ymm7,ymm14,YMMWORD[96+rcx]
+
+
+
+ vaesenclast ymm4,ymm0,ymm4
+ vaesenclast ymm5,ymm1,ymm5
+ vaesenclast ymm6,ymm2,ymm6
+ vaesenclast ymm7,ymm3,ymm7
+
+
+ vmovdqu8 YMMWORD[rdx],ymm4
+ vmovdqu8 YMMWORD[32+rdx],ymm5
+ vmovdqu8 YMMWORD[64+rdx],ymm6
+ vmovdqu8 YMMWORD[96+rdx],ymm7
+
+ sub rcx,-4*32
+ sub rdx,-4*32
+ add r8,-4*32
+ cmp r8,4*32-1
+ ja NEAR $L$crypt_loop_4x__func2
+$L$crypt_loop_4x_done__func2:
+
+ test r8,r8
+ jz NEAR $L$done__func2
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ mov rax,r8
+ neg rax
+ and rax,-16
+ lea rsi,[256+rax*1+rdi]
+ vpxor xmm4,xmm4,xmm4
+ vpxor xmm5,xmm5,xmm5
+ vpxor xmm6,xmm6,xmm6
+
+ cmp r8,32
+ jb NEAR $L$partial_vec__func2
+
+$L$crypt_loop_1x__func2:
+
+
+
+ vpshufb ymm0,ymm12,ymm8
+ vpaddd ymm12,ymm12,ymm11
+ vpxord ymm0,ymm0,ymm13
+ lea rax,[16+r9]
+$L$vaesenc_loop_tail_full_vec__func2:
+ vbroadcasti32x4 ymm9,YMMWORD[rax]
+ vaesenc ymm0,ymm0,ymm9
+ add rax,16
+ cmp r11,rax
+ jne NEAR $L$vaesenc_loop_tail_full_vec__func2
+ vaesenclast ymm0,ymm0,ymm14
+
+
+ vmovdqu8 ymm1,YMMWORD[rcx]
+ vpxord ymm0,ymm0,ymm1
+ vmovdqu8 YMMWORD[rdx],ymm0
+
+
+ vmovdqu8 ymm30,YMMWORD[rsi]
+ vpshufb ymm0,ymm1,ymm8
+ vpxord ymm0,ymm0,ymm10
+ vpclmulqdq ymm7,ymm0,ymm30,0x00
+ vpclmulqdq ymm1,ymm0,ymm30,0x01
+ vpclmulqdq ymm2,ymm0,ymm30,0x10
+ vpclmulqdq ymm3,ymm0,ymm30,0x11
+ vpxord ymm4,ymm4,ymm7
+ vpternlogd ymm5,ymm1,ymm2,0x96
+ vpxord ymm6,ymm6,ymm3
+
+ vpxor xmm10,xmm10,xmm10
+
+ add rsi,32
+ add rcx,32
+ add rdx,32
+ sub r8,32
+ cmp r8,32
+ jae NEAR $L$crypt_loop_1x__func2
+
+ test r8,r8
+ jz NEAR $L$reduce__func2
+
+$L$partial_vec__func2:
+
+
+
+
+ mov rax,-1
+ bzhi rax,rax,r8
+ kmovd k1,eax
+ add r8,15
+ and r8,-16
+ mov rax,-1
+ bzhi rax,rax,r8
+ kmovd k2,eax
+
+
+
+ vpshufb ymm0,ymm12,ymm8
+ vpxord ymm0,ymm0,ymm13
+ lea rax,[16+r9]
+$L$vaesenc_loop_tail_partialvec__func2:
+ vbroadcasti32x4 ymm9,YMMWORD[rax]
+ vaesenc ymm0,ymm0,ymm9
+ add rax,16
+ cmp r11,rax
+ jne NEAR $L$vaesenc_loop_tail_partialvec__func2
+ vaesenclast ymm0,ymm0,ymm14
+
+
+ vmovdqu8 ymm1{k1}{z},[rcx]
+ vpxord ymm0,ymm0,ymm1
+ vmovdqu8 YMMWORD[rdx]{k1},ymm0
+
+
+
+
+
+
+
+
+
+
+
+
+
+ vmovdqu8 ymm30{k2}{z},[rsi]
+
+ vpshufb ymm0,ymm1,ymm8
+ vpxord ymm0,ymm0,ymm10
+ vpclmulqdq ymm7,ymm0,ymm30,0x00
+ vpclmulqdq ymm1,ymm0,ymm30,0x01
+ vpclmulqdq ymm2,ymm0,ymm30,0x10
+ vpclmulqdq ymm3,ymm0,ymm30,0x11
+ vpxord ymm4,ymm4,ymm7
+ vpternlogd ymm5,ymm1,ymm2,0x96
+ vpxord ymm6,ymm6,ymm3
+
+
+$L$reduce__func2:
+
+ vpclmulqdq ymm0,ymm31,ymm4,0x01
+ vpshufd ymm4,ymm4,0x4e
+ vpternlogd ymm5,ymm4,ymm0,0x96
+ vpclmulqdq ymm0,ymm31,ymm5,0x01
+ vpshufd ymm5,ymm5,0x4e
+ vpternlogd ymm6,ymm5,ymm0,0x96
+
+ vextracti32x4 xmm0,ymm6,1
+ vpxord xmm10,xmm6,xmm0
+
+
+$L$done__func2:
+
+ vpshufb xmm10,xmm10,xmm8
+ vmovdqu XMMWORD[r12],xmm10
+
+ vzeroupper
+ movdqa xmm6,XMMWORD[rsp]
+ movdqa xmm7,XMMWORD[16+rsp]
+ movdqa xmm8,XMMWORD[32+rsp]
+ movdqa xmm9,XMMWORD[48+rsp]
+ movdqa xmm10,XMMWORD[64+rsp]
+ movdqa xmm11,XMMWORD[80+rsp]
+ movdqa xmm12,XMMWORD[96+rsp]
+ movdqa xmm13,XMMWORD[112+rsp]
+ movdqa xmm14,XMMWORD[128+rsp]
+ movdqa xmm15,XMMWORD[144+rsp]
+ add rsp,160
+ pop r12
+ pop rdi
+ pop rsi
+ ret
+$L$SEH_end_aes_gcm_dec_update_vaes_avx10_256_17:
+
+
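+; gcm_ghash_vpclmulqdq_avx10_512: same structure as the _256 variant above,
+; but operating on 512-bit vectors of four blocks each, so the main loop
+; consumes 256 bytes of data per iteration.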
+global gcm_ghash_vpclmulqdq_avx10_512
+
+ALIGN 32
+gcm_ghash_vpclmulqdq_avx10_512:
+
+$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1:
+_CET_ENDBR
+ sub rsp,136
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_2:
+ movdqa XMMWORD[rsp],xmm6
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_3:
+ movdqa XMMWORD[16+rsp],xmm7
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_4:
+ movdqa XMMWORD[32+rsp],xmm8
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_5:
+ movdqa XMMWORD[48+rsp],xmm9
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_6:
+ movdqa XMMWORD[64+rsp],xmm10
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_7:
+ movdqa XMMWORD[80+rsp],xmm11
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_8:
+ movdqa XMMWORD[96+rsp],xmm12
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_9:
+ movdqa XMMWORD[112+rsp],xmm13
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_10:
+
+$L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx10_512_11:
+
+
+
+
+ vmovdqu xmm4,XMMWORD[$L$bswap_mask]
+ vmovdqu xmm10,XMMWORD[$L$gfpoly]
+
+
+ vmovdqu xmm5,XMMWORD[rcx]
+ vpshufb xmm5,xmm5,xmm4
+
+
+ cmp r9,64
+ jb NEAR $L$aad_blockbyblock__func2
+
+
+
+ vshufi64x2 zmm4,zmm4,zmm4,0
+ vshufi64x2 zmm10,zmm10,zmm10,0
+
+
+ vmovdqu8 zmm9,ZMMWORD[((256-64))+rdx]
+
+ cmp r9,4*64-1
+ jbe NEAR $L$aad_loop_1x__func2
+
+
+ vmovdqu8 zmm6,ZMMWORD[((256-256))+rdx]
+ vmovdqu8 zmm7,ZMMWORD[((256-192))+rdx]
+ vmovdqu8 zmm8,ZMMWORD[((256-128))+rdx]
+
+
+$L$aad_loop_4x__func2:
+ vmovdqu8 zmm0,ZMMWORD[r8]
+ vmovdqu8 zmm1,ZMMWORD[64+r8]
+ vmovdqu8 zmm2,ZMMWORD[128+r8]
+ vmovdqu8 zmm3,ZMMWORD[192+r8]
+ vpshufb zmm0,zmm0,zmm4
+ vpxord zmm0,zmm0,zmm5
+ vpshufb zmm1,zmm1,zmm4
+ vpshufb zmm2,zmm2,zmm4
+ vpshufb zmm3,zmm3,zmm4
+ vpclmulqdq zmm5,zmm0,zmm6,0x00
+ vpclmulqdq zmm11,zmm1,zmm7,0x00
+ vpclmulqdq zmm12,zmm2,zmm8,0x00
+ vpxord zmm5,zmm5,zmm11
+ vpclmulqdq zmm13,zmm3,zmm9,0x00
+ vpternlogd zmm5,zmm12,zmm13,0x96
+ vpclmulqdq zmm11,zmm0,zmm6,0x01
+ vpclmulqdq zmm12,zmm1,zmm7,0x01
+ vpclmulqdq zmm13,zmm2,zmm8,0x01
+ vpternlogd zmm11,zmm12,zmm13,0x96
+ vpclmulqdq zmm12,zmm3,zmm9,0x01
+ vpclmulqdq zmm13,zmm0,zmm6,0x10
+ vpternlogd zmm11,zmm12,zmm13,0x96
+ vpclmulqdq zmm12,zmm1,zmm7,0x10
+ vpclmulqdq zmm13,zmm2,zmm8,0x10
+ vpternlogd zmm11,zmm12,zmm13,0x96
+ vpclmulqdq zmm13,zmm10,zmm5,0x01
+ vpclmulqdq zmm12,zmm3,zmm9,0x10
+ vpxord zmm11,zmm11,zmm12
+ vpshufd zmm5,zmm5,0x4e
+ vpclmulqdq zmm0,zmm0,zmm6,0x11
+ vpclmulqdq zmm1,zmm1,zmm7,0x11
+ vpclmulqdq zmm2,zmm2,zmm8,0x11
+ vpternlogd zmm11,zmm5,zmm13,0x96
+ vpclmulqdq zmm3,zmm3,zmm9,0x11
+ vpternlogd zmm0,zmm1,zmm2,0x96
+ vpclmulqdq zmm12,zmm10,zmm11,0x01
+ vpxord zmm5,zmm0,zmm3
+ vpshufd zmm11,zmm11,0x4e
+ vpternlogd zmm5,zmm11,zmm12,0x96
+ vextracti32x4 xmm0,zmm5,1
+ vextracti32x4 xmm1,zmm5,2
+ vextracti32x4 xmm2,zmm5,3
+ vpxord xmm5,xmm5,xmm0
+ vpternlogd xmm5,xmm2,xmm1,0x96
+
+ sub r8,-4*64
+ add r9,-4*64
+ cmp r9,4*64-1
+ ja NEAR $L$aad_loop_4x__func2
+
+
+ cmp r9,64
+ jb NEAR $L$aad_large_done__func2
+$L$aad_loop_1x__func2:
+ vmovdqu8 zmm0,ZMMWORD[r8]
+ vpshufb zmm0,zmm0,zmm4
+ vpxord zmm5,zmm5,zmm0
+ vpclmulqdq zmm0,zmm5,zmm9,0x00
+ vpclmulqdq zmm1,zmm5,zmm9,0x01
+ vpclmulqdq zmm2,zmm5,zmm9,0x10
+ vpxord zmm1,zmm1,zmm2
+ vpclmulqdq zmm2,zmm10,zmm0,0x01
+ vpshufd zmm0,zmm0,0x4e
+ vpternlogd zmm1,zmm0,zmm2,0x96
+ vpclmulqdq zmm5,zmm5,zmm9,0x11
+ vpclmulqdq zmm0,zmm10,zmm1,0x01
+ vpshufd zmm1,zmm1,0x4e
+ vpternlogd zmm5,zmm1,zmm0,0x96
+
+ vextracti32x4 xmm0,zmm5,1
+ vextracti32x4 xmm1,zmm5,2
+ vextracti32x4 xmm2,zmm5,3
+ vpxord xmm5,xmm5,xmm0
+ vpternlogd xmm5,xmm2,xmm1,0x96
+
+ add r8,64
+ sub r9,64
+ cmp r9,64
+ jae NEAR $L$aad_loop_1x__func2
+
+$L$aad_large_done__func2:
+
+
+ vzeroupper
+
+
+$L$aad_blockbyblock__func2:
+ test r9,r9
+ jz NEAR $L$aad_done__func2
+ vmovdqu xmm9,XMMWORD[((256-16))+rdx]
+$L$aad_loop_blockbyblock__func2:
+ vmovdqu xmm0,XMMWORD[r8]
+ vpshufb xmm0,xmm0,xmm4
+ vpxor xmm5,xmm5,xmm0
+ vpclmulqdq xmm0,xmm5,xmm9,0x00
+ vpclmulqdq xmm1,xmm5,xmm9,0x01
+ vpclmulqdq xmm2,xmm5,xmm9,0x10
+ vpxord xmm1,xmm1,xmm2
+ vpclmulqdq xmm2,xmm10,xmm0,0x01
+ vpshufd xmm0,xmm0,0x4e
+ vpternlogd xmm1,xmm0,xmm2,0x96
+ vpclmulqdq xmm5,xmm5,xmm9,0x11
+ vpclmulqdq xmm0,xmm10,xmm1,0x01
+ vpshufd xmm1,xmm1,0x4e
+ vpternlogd xmm5,xmm1,xmm0,0x96
+
+ add r8,16
+ sub r9,16
+ jnz NEAR $L$aad_loop_blockbyblock__func2
+
+$L$aad_done__func2:
+
+ vpshufb xmm5,xmm5,xmm4
+ vmovdqu XMMWORD[rcx],xmm5
+ movdqa xmm6,XMMWORD[rsp]
+ movdqa xmm7,XMMWORD[16+rsp]
+ movdqa xmm8,XMMWORD[32+rsp]
+ movdqa xmm9,XMMWORD[48+rsp]
+ movdqa xmm10,XMMWORD[64+rsp]
+ movdqa xmm11,XMMWORD[80+rsp]
+ movdqa xmm12,XMMWORD[96+rsp]
+ movdqa xmm13,XMMWORD[112+rsp]
+ add rsp,136
+ ret
+$L$SEH_end_gcm_ghash_vpclmulqdq_avx10_512_12:
+
+
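+; aes_gcm_enc_update_vaes_avx10_512: 512-bit-vector version of
+; aes_gcm_enc_update_vaes_avx10_256; each main-loop iteration encrypts and
+; hashes four 64-byte vectors (sixteen AES blocks), advancing the counters
+; by four blocks per vector using inc_4blocks.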
+global aes_gcm_enc_update_vaes_avx10_512
+
+ALIGN 32
+aes_gcm_enc_update_vaes_avx10_512:
+
+$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1:
+_CET_ENDBR
+ push rsi
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_2:
+ push rdi
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_3:
+ push r12
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_4:
+
+ mov rsi,QWORD[64+rsp]
+ mov rdi,QWORD[72+rsp]
+ mov r12,QWORD[80+rsp]
+ sub rsp,160
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_5:
+ movdqa XMMWORD[rsp],xmm6
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_6:
+ movdqa XMMWORD[16+rsp],xmm7
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_7:
+ movdqa XMMWORD[32+rsp],xmm8
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_8:
+ movdqa XMMWORD[48+rsp],xmm9
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_9:
+ movdqa XMMWORD[64+rsp],xmm10
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_10:
+ movdqa XMMWORD[80+rsp],xmm11
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_11:
+ movdqa XMMWORD[96+rsp],xmm12
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_12:
+ movdqa XMMWORD[112+rsp],xmm13
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_13:
+ movdqa XMMWORD[128+rsp],xmm14
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_14:
+ movdqa XMMWORD[144+rsp],xmm15
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_15:
+
+$L$SEH_endprologue_aes_gcm_enc_update_vaes_avx10_512_16:
+%ifdef BORINGSSL_DISPATCH_TEST
+EXTERN BORINGSSL_function_hit
+ mov BYTE[((BORINGSSL_function_hit+7))],1
+%endif
+
+ vbroadcasti32x4 zmm8,ZMMWORD[$L$bswap_mask]
+ vbroadcasti32x4 zmm31,ZMMWORD[$L$gfpoly]
+
+
+
+ vmovdqu xmm10,XMMWORD[r12]
+ vpshufb xmm10,xmm10,xmm8
+ vbroadcasti32x4 zmm12,ZMMWORD[rsi]
+ vpshufb zmm12,zmm12,zmm8
+
+
+
+ mov r10d,DWORD[240+r9]
+ lea r10d,[((-20))+r10*4]
+
+
+
+
+ lea r11,[96+r10*4+r9]
+ vbroadcasti32x4 zmm13,ZMMWORD[r9]
+ vbroadcasti32x4 zmm14,ZMMWORD[r11]
+
+
+ vpaddd zmm12,zmm12,ZMMWORD[$L$ctr_pattern]
+
+
+ vbroadcasti32x4 zmm11,ZMMWORD[$L$inc_4blocks]
+
+
+
+ cmp r8,4*64-1
+ jbe NEAR $L$crypt_loop_4x_done__func3
+
+
+ vmovdqu8 zmm27,ZMMWORD[((256-256))+rdi]
+ vmovdqu8 zmm28,ZMMWORD[((256-192))+rdi]
+ vmovdqu8 zmm29,ZMMWORD[((256-128))+rdi]
+ vmovdqu8 zmm30,ZMMWORD[((256-64))+rdi]
+
+
+
+
+ vpshufb zmm0,zmm12,zmm8
+ vpaddd zmm12,zmm12,zmm11
+ vpshufb zmm1,zmm12,zmm8
+ vpaddd zmm12,zmm12,zmm11
+ vpshufb zmm2,zmm12,zmm8
+ vpaddd zmm12,zmm12,zmm11
+ vpshufb zmm3,zmm12,zmm8
+ vpaddd zmm12,zmm12,zmm11
+
+
+ vpxord zmm0,zmm0,zmm13
+ vpxord zmm1,zmm1,zmm13
+ vpxord zmm2,zmm2,zmm13
+ vpxord zmm3,zmm3,zmm13
+
+ lea rax,[16+r9]
+$L$vaesenc_loop_first_4_vecs__func3:
+ vbroadcasti32x4 zmm9,ZMMWORD[rax]
+ vaesenc zmm0,zmm0,zmm9
+ vaesenc zmm1,zmm1,zmm9
+ vaesenc zmm2,zmm2,zmm9
+ vaesenc zmm3,zmm3,zmm9
+
+ add rax,16
+ cmp r11,rax
+ jne NEAR $L$vaesenc_loop_first_4_vecs__func3
+
+
+
+ vpxord zmm4,zmm14,ZMMWORD[rcx]
+ vpxord zmm5,zmm14,ZMMWORD[64+rcx]
+ vpxord zmm6,zmm14,ZMMWORD[128+rcx]
+ vpxord zmm7,zmm14,ZMMWORD[192+rcx]
+
+
+
+ vaesenclast zmm4,zmm0,zmm4
+ vaesenclast zmm5,zmm1,zmm5
+ vaesenclast zmm6,zmm2,zmm6
+ vaesenclast zmm7,zmm3,zmm7
+
+
+ vmovdqu8 ZMMWORD[rdx],zmm4
+ vmovdqu8 ZMMWORD[64+rdx],zmm5
+ vmovdqu8 ZMMWORD[128+rdx],zmm6
+ vmovdqu8 ZMMWORD[192+rdx],zmm7
+
+ sub rcx,-4*64
+ sub rdx,-4*64
+ add r8,-4*64
+ cmp r8,4*64-1
+ jbe NEAR $L$ghash_last_ciphertext_4x__func3
+ vbroadcasti32x4 zmm15,ZMMWORD[((-144))+r11]
+ vbroadcasti32x4 zmm16,ZMMWORD[((-128))+r11]
+ vbroadcasti32x4 zmm17,ZMMWORD[((-112))+r11]
+ vbroadcasti32x4 zmm18,ZMMWORD[((-96))+r11]
+ vbroadcasti32x4 zmm19,ZMMWORD[((-80))+r11]
+ vbroadcasti32x4 zmm20,ZMMWORD[((-64))+r11]
+ vbroadcasti32x4 zmm21,ZMMWORD[((-48))+r11]
+ vbroadcasti32x4 zmm22,ZMMWORD[((-32))+r11]
+ vbroadcasti32x4 zmm23,ZMMWORD[((-16))+r11]
+$L$crypt_loop_4x__func3:
+
+
+
+ vpshufb zmm0,zmm12,zmm8
+ vpaddd zmm12,zmm12,zmm11
+ vpshufb zmm1,zmm12,zmm8
+ vpaddd zmm12,zmm12,zmm11
+ vpshufb zmm2,zmm12,zmm8
+ vpaddd zmm12,zmm12,zmm11
+ vpshufb zmm3,zmm12,zmm8
+ vpaddd zmm12,zmm12,zmm11
+
+
+ vpxord zmm0,zmm0,zmm13
+ vpxord zmm1,zmm1,zmm13
+ vpxord zmm2,zmm2,zmm13
+ vpxord zmm3,zmm3,zmm13
+
+ cmp r10d,24
+ jl NEAR $L$aes128__func3
+ je NEAR $L$aes192__func3
+
+ vbroadcasti32x4 zmm9,ZMMWORD[((-208))+r11]
+ vaesenc zmm0,zmm0,zmm9
+ vaesenc zmm1,zmm1,zmm9
+ vaesenc zmm2,zmm2,zmm9
+ vaesenc zmm3,zmm3,zmm9
+
+ vbroadcasti32x4 zmm9,ZMMWORD[((-192))+r11]
+ vaesenc zmm0,zmm0,zmm9
+ vaesenc zmm1,zmm1,zmm9
+ vaesenc zmm2,zmm2,zmm9
+ vaesenc zmm3,zmm3,zmm9
+
+$L$aes192__func3:
+ vbroadcasti32x4 zmm9,ZMMWORD[((-176))+r11]
+ vaesenc zmm0,zmm0,zmm9
+ vaesenc zmm1,zmm1,zmm9
+ vaesenc zmm2,zmm2,zmm9
+ vaesenc zmm3,zmm3,zmm9
+
+ vbroadcasti32x4 zmm9,ZMMWORD[((-160))+r11]
+ vaesenc zmm0,zmm0,zmm9
+ vaesenc zmm1,zmm1,zmm9
+ vaesenc zmm2,zmm2,zmm9
+ vaesenc zmm3,zmm3,zmm9
+
+$L$aes128__func3:
+ vpshufb zmm4,zmm4,zmm8
+ vpxord zmm4,zmm4,zmm10
+ vpshufb zmm5,zmm5,zmm8
+ vpshufb zmm6,zmm6,zmm8
+
+ vaesenc zmm0,zmm0,zmm15
+ vaesenc zmm1,zmm1,zmm15
+ vaesenc zmm2,zmm2,zmm15
+ vaesenc zmm3,zmm3,zmm15
+
+ vpshufb zmm7,zmm7,zmm8
+ vpclmulqdq zmm10,zmm4,zmm27,0x00
+ vpclmulqdq zmm24,zmm5,zmm28,0x00
+ vpclmulqdq zmm25,zmm6,zmm29,0x00
+
+ vaesenc zmm0,zmm0,zmm16
+ vaesenc zmm1,zmm1,zmm16
+ vaesenc zmm2,zmm2,zmm16
+ vaesenc zmm3,zmm3,zmm16
+
+ vpxord zmm10,zmm10,zmm24
+ vpclmulqdq zmm26,zmm7,zmm30,0x00
+ vpternlogd zmm10,zmm25,zmm26,0x96
+ vpclmulqdq zmm24,zmm4,zmm27,0x01
+
+ vaesenc zmm0,zmm0,zmm17
+ vaesenc zmm1,zmm1,zmm17
+ vaesenc zmm2,zmm2,zmm17
+ vaesenc zmm3,zmm3,zmm17
+
+ vpclmulqdq zmm25,zmm5,zmm28,0x01
+ vpclmulqdq zmm26,zmm6,zmm29,0x01
+ vpternlogd zmm24,zmm25,zmm26,0x96
+ vpclmulqdq zmm25,zmm7,zmm30,0x01
+
+ vaesenc zmm0,zmm0,zmm18
+ vaesenc zmm1,zmm1,zmm18
+ vaesenc zmm2,zmm2,zmm18
+ vaesenc zmm3,zmm3,zmm18
+
+ vpclmulqdq zmm26,zmm4,zmm27,0x10
+ vpternlogd zmm24,zmm25,zmm26,0x96
+ vpclmulqdq zmm25,zmm5,zmm28,0x10
+ vpclmulqdq zmm26,zmm6,zmm29,0x10
+
+ vaesenc zmm0,zmm0,zmm19
+ vaesenc zmm1,zmm1,zmm19
+ vaesenc zmm2,zmm2,zmm19
+ vaesenc zmm3,zmm3,zmm19
+
+ vpternlogd zmm24,zmm25,zmm26,0x96
+ vpclmulqdq zmm26,zmm31,zmm10,0x01
+ vpclmulqdq zmm25,zmm7,zmm30,0x10
+ vpxord zmm24,zmm24,zmm25
+
+ vaesenc zmm0,zmm0,zmm20
+ vaesenc zmm1,zmm1,zmm20
+ vaesenc zmm2,zmm2,zmm20
+ vaesenc zmm3,zmm3,zmm20
+
+ vpshufd zmm10,zmm10,0x4e
+ vpclmulqdq zmm4,zmm4,zmm27,0x11
+ vpclmulqdq zmm5,zmm5,zmm28,0x11
+ vpclmulqdq zmm6,zmm6,zmm29,0x11
+
+ vaesenc zmm0,zmm0,zmm21
+ vaesenc zmm1,zmm1,zmm21
+ vaesenc zmm2,zmm2,zmm21
+ vaesenc zmm3,zmm3,zmm21
+
+ vpternlogd zmm24,zmm10,zmm26,0x96
+ vpclmulqdq zmm7,zmm7,zmm30,0x11
+ vpternlogd zmm4,zmm5,zmm6,0x96
+ vpclmulqdq zmm25,zmm31,zmm24,0x01
+
+ vaesenc zmm0,zmm0,zmm22
+ vaesenc zmm1,zmm1,zmm22
+ vaesenc zmm2,zmm2,zmm22
+ vaesenc zmm3,zmm3,zmm22
+
+ vpxord zmm10,zmm4,zmm7
+ vpshufd zmm24,zmm24,0x4e
+ vpternlogd zmm10,zmm24,zmm25,0x96
+
+ vaesenc zmm0,zmm0,zmm23
+ vaesenc zmm1,zmm1,zmm23
+ vaesenc zmm2,zmm2,zmm23
+ vaesenc zmm3,zmm3,zmm23
+
+ vextracti32x4 xmm4,zmm10,1
+ vextracti32x4 xmm5,zmm10,2
+ vextracti32x4 xmm6,zmm10,3
+ vpxord xmm10,xmm10,xmm4
+ vpternlogd xmm10,xmm6,xmm5,0x96
+
+
+
+
+ vpxord zmm4,zmm14,ZMMWORD[rcx]
+ vpxord zmm5,zmm14,ZMMWORD[64+rcx]
+ vpxord zmm6,zmm14,ZMMWORD[128+rcx]
+ vpxord zmm7,zmm14,ZMMWORD[192+rcx]
+
+
+
+ vaesenclast zmm4,zmm0,zmm4
+ vaesenclast zmm5,zmm1,zmm5
+ vaesenclast zmm6,zmm2,zmm6
+ vaesenclast zmm7,zmm3,zmm7
+
+
+ vmovdqu8 ZMMWORD[rdx],zmm4
+ vmovdqu8 ZMMWORD[64+rdx],zmm5
+ vmovdqu8 ZMMWORD[128+rdx],zmm6
+ vmovdqu8 ZMMWORD[192+rdx],zmm7
+
+ sub rcx,-4*64
+ sub rdx,-4*64
+ add r8,-4*64
+ cmp r8,4*64-1
+ ja NEAR $L$crypt_loop_4x__func3
+$L$ghash_last_ciphertext_4x__func3:
+ vpshufb zmm4,zmm4,zmm8
+ vpxord zmm4,zmm4,zmm10
+ vpshufb zmm5,zmm5,zmm8
+ vpshufb zmm6,zmm6,zmm8
+ vpshufb zmm7,zmm7,zmm8
+ vpclmulqdq zmm10,zmm4,zmm27,0x00
+ vpclmulqdq zmm24,zmm5,zmm28,0x00
+ vpclmulqdq zmm25,zmm6,zmm29,0x00
+ vpxord zmm10,zmm10,zmm24
+ vpclmulqdq zmm26,zmm7,zmm30,0x00
+ vpternlogd zmm10,zmm25,zmm26,0x96
+ vpclmulqdq zmm24,zmm4,zmm27,0x01
+ vpclmulqdq zmm25,zmm5,zmm28,0x01
+ vpclmulqdq zmm26,zmm6,zmm29,0x01
+ vpternlogd zmm24,zmm25,zmm26,0x96
+ vpclmulqdq zmm25,zmm7,zmm30,0x01
+ vpclmulqdq zmm26,zmm4,zmm27,0x10
+ vpternlogd zmm24,zmm25,zmm26,0x96
+ vpclmulqdq zmm25,zmm5,zmm28,0x10
+ vpclmulqdq zmm26,zmm6,zmm29,0x10
+ vpternlogd zmm24,zmm25,zmm26,0x96
+ vpclmulqdq zmm26,zmm31,zmm10,0x01
+ vpclmulqdq zmm25,zmm7,zmm30,0x10
+ vpxord zmm24,zmm24,zmm25
+ vpshufd zmm10,zmm10,0x4e
+ vpclmulqdq zmm4,zmm4,zmm27,0x11
+ vpclmulqdq zmm5,zmm5,zmm28,0x11
+ vpclmulqdq zmm6,zmm6,zmm29,0x11
+ vpternlogd zmm24,zmm10,zmm26,0x96
+ vpclmulqdq zmm7,zmm7,zmm30,0x11
+ vpternlogd zmm4,zmm5,zmm6,0x96
+ vpclmulqdq zmm25,zmm31,zmm24,0x01
+ vpxord zmm10,zmm4,zmm7
+ vpshufd zmm24,zmm24,0x4e
+ vpternlogd zmm10,zmm24,zmm25,0x96
+ vextracti32x4 xmm4,zmm10,1
+ vextracti32x4 xmm5,zmm10,2
+ vextracti32x4 xmm6,zmm10,3
+ vpxord xmm10,xmm10,xmm4
+ vpternlogd xmm10,xmm6,xmm5,0x96
+
+$L$crypt_loop_4x_done__func3:
+
+ test r8,r8
+ jz NEAR $L$done__func3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ mov rax,r8
+ neg rax
+ and rax,-16
+ lea rsi,[256+rax*1+rdi]
+ vpxor xmm4,xmm4,xmm4
+ vpxor xmm5,xmm5,xmm5
+ vpxor xmm6,xmm6,xmm6
+
+ cmp r8,64
+ jb NEAR $L$partial_vec__func3
+
+$L$crypt_loop_1x__func3:
+
+
+
+ vpshufb zmm0,zmm12,zmm8
+ vpaddd zmm12,zmm12,zmm11
+ vpxord zmm0,zmm0,zmm13
+ lea rax,[16+r9]
+$L$vaesenc_loop_tail_full_vec__func3:
+ vbroadcasti32x4 zmm9,ZMMWORD[rax]
+ vaesenc zmm0,zmm0,zmm9
+ add rax,16
+ cmp r11,rax
+ jne NEAR $L$vaesenc_loop_tail_full_vec__func3
+ vaesenclast zmm0,zmm0,zmm14
+
+
+ vmovdqu8 zmm1,ZMMWORD[rcx]
+ vpxord zmm0,zmm0,zmm1
+ vmovdqu8 ZMMWORD[rdx],zmm0
+
+
+ vmovdqu8 zmm30,ZMMWORD[rsi]
+ vpshufb zmm0,zmm0,zmm8
+ vpxord zmm0,zmm0,zmm10
+ vpclmulqdq zmm7,zmm0,zmm30,0x00
+ vpclmulqdq zmm1,zmm0,zmm30,0x01
+ vpclmulqdq zmm2,zmm0,zmm30,0x10
+ vpclmulqdq zmm3,zmm0,zmm30,0x11
+ vpxord zmm4,zmm4,zmm7
+ vpternlogd zmm5,zmm1,zmm2,0x96
+ vpxord zmm6,zmm6,zmm3
+
+ vpxor xmm10,xmm10,xmm10
+
+ add rsi,64
+ add rcx,64
+ add rdx,64
+ sub r8,64
+ cmp r8,64
+ jae NEAR $L$crypt_loop_1x__func3
+
+ test r8,r8
+ jz NEAR $L$reduce__func3
+
+$L$partial_vec__func3:
+
+
+
+
+ mov rax,-1
+ bzhi rax,rax,r8
+ kmovq k1,rax
+ add r8,15
+ and r8,-16
+ mov rax,-1
+ bzhi rax,rax,r8
+ kmovq k2,rax
+
+
+
+ vpshufb zmm0,zmm12,zmm8
+ vpxord zmm0,zmm0,zmm13
+ lea rax,[16+r9]
+$L$vaesenc_loop_tail_partialvec__func3:
+ vbroadcasti32x4 zmm9,ZMMWORD[rax]
+ vaesenc zmm0,zmm0,zmm9
+ add rax,16
+ cmp r11,rax
+ jne NEAR $L$vaesenc_loop_tail_partialvec__func3
+ vaesenclast zmm0,zmm0,zmm14
+
+
+ vmovdqu8 zmm1{k1}{z},[rcx]
+ vpxord zmm0,zmm0,zmm1
+ vmovdqu8 ZMMWORD[rdx]{k1},zmm0
+
+
+
+
+
+
+
+
+
+
+
+
+
+ vmovdqu8 zmm30{k2}{z},[rsi]
+ vmovdqu8 zmm1{k1}{z},zmm0
+ vpshufb zmm0,zmm1,zmm8
+ vpxord zmm0,zmm0,zmm10
+ vpclmulqdq zmm7,zmm0,zmm30,0x00
+ vpclmulqdq zmm1,zmm0,zmm30,0x01
+ vpclmulqdq zmm2,zmm0,zmm30,0x10
+ vpclmulqdq zmm3,zmm0,zmm30,0x11
+ vpxord zmm4,zmm4,zmm7
+ vpternlogd zmm5,zmm1,zmm2,0x96
+ vpxord zmm6,zmm6,zmm3
+
+
+$L$reduce__func3:
+
+ vpclmulqdq zmm0,zmm31,zmm4,0x01
+ vpshufd zmm4,zmm4,0x4e
+ vpternlogd zmm5,zmm4,zmm0,0x96
+ vpclmulqdq zmm0,zmm31,zmm5,0x01
+ vpshufd zmm5,zmm5,0x4e
+ vpternlogd zmm6,zmm5,zmm0,0x96
+
+ vextracti32x4 xmm0,zmm6,1
+ vextracti32x4 xmm1,zmm6,2
+ vextracti32x4 xmm2,zmm6,3
+ vpxord xmm10,xmm6,xmm0
+ vpternlogd xmm10,xmm2,xmm1,0x96
+
+
+$L$done__func3:
+
+ vpshufb xmm10,xmm10,xmm8
+ vmovdqu XMMWORD[r12],xmm10
+
+ vzeroupper
+ movdqa xmm6,XMMWORD[rsp]
+ movdqa xmm7,XMMWORD[16+rsp]
+ movdqa xmm8,XMMWORD[32+rsp]
+ movdqa xmm9,XMMWORD[48+rsp]
+ movdqa xmm10,XMMWORD[64+rsp]
+ movdqa xmm11,XMMWORD[80+rsp]
+ movdqa xmm12,XMMWORD[96+rsp]
+ movdqa xmm13,XMMWORD[112+rsp]
+ movdqa xmm14,XMMWORD[128+rsp]
+ movdqa xmm15,XMMWORD[144+rsp]
+ add rsp,160
+ pop r12
+ pop rdi
+ pop rsi
+ ret
+$L$SEH_end_aes_gcm_enc_update_vaes_avx10_512_17:
+
+
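+; aes_gcm_dec_update_vaes_avx10_512: 512-bit-vector version of
+; aes_gcm_dec_update_vaes_avx10_256, hashing the ciphertext in the same pass
+; that decrypts it.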
+global aes_gcm_dec_update_vaes_avx10_512
+
+ALIGN 32
+aes_gcm_dec_update_vaes_avx10_512:
+
+$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1:
+_CET_ENDBR
+ push rsi
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_2:
+ push rdi
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_3:
+ push r12
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_4:
+
+ mov rsi,QWORD[64+rsp]
+ mov rdi,QWORD[72+rsp]
+ mov r12,QWORD[80+rsp]
+ sub rsp,160
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_5:
+ movdqa XMMWORD[rsp],xmm6
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_6:
+ movdqa XMMWORD[16+rsp],xmm7
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_7:
+ movdqa XMMWORD[32+rsp],xmm8
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_8:
+ movdqa XMMWORD[48+rsp],xmm9
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_9:
+ movdqa XMMWORD[64+rsp],xmm10
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_10:
+ movdqa XMMWORD[80+rsp],xmm11
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_11:
+ movdqa XMMWORD[96+rsp],xmm12
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_12:
+ movdqa XMMWORD[112+rsp],xmm13
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_13:
+ movdqa XMMWORD[128+rsp],xmm14
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_14:
+ movdqa XMMWORD[144+rsp],xmm15
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_15:
+
+$L$SEH_endprologue_aes_gcm_dec_update_vaes_avx10_512_16:
+
+ vbroadcasti32x4 zmm8,ZMMWORD[$L$bswap_mask]
+ vbroadcasti32x4 zmm31,ZMMWORD[$L$gfpoly]
+
+
+
+ vmovdqu xmm10,XMMWORD[r12]
+ vpshufb xmm10,xmm10,xmm8
+ vbroadcasti32x4 zmm12,ZMMWORD[rsi]
+ vpshufb zmm12,zmm12,zmm8
+
+
+
+ mov r10d,DWORD[240+r9]
+ lea r10d,[((-20))+r10*4]
+
+
+
+
+ lea r11,[96+r10*4+r9]
+ vbroadcasti32x4 zmm13,ZMMWORD[r9]
+ vbroadcasti32x4 zmm14,ZMMWORD[r11]
+
+
+ vpaddd zmm12,zmm12,ZMMWORD[$L$ctr_pattern]
+
+
+ vbroadcasti32x4 zmm11,ZMMWORD[$L$inc_4blocks]
+
+
+
+ cmp r8,4*64-1
+ jbe NEAR $L$crypt_loop_4x_done__func4
+
+
+ vmovdqu8 zmm27,ZMMWORD[((256-256))+rdi]
+ vmovdqu8 zmm28,ZMMWORD[((256-192))+rdi]
+ vmovdqu8 zmm29,ZMMWORD[((256-128))+rdi]
+ vmovdqu8 zmm30,ZMMWORD[((256-64))+rdi]
+ vbroadcasti32x4 zmm15,ZMMWORD[((-144))+r11]
+ vbroadcasti32x4 zmm16,ZMMWORD[((-128))+r11]
+ vbroadcasti32x4 zmm17,ZMMWORD[((-112))+r11]
+ vbroadcasti32x4 zmm18,ZMMWORD[((-96))+r11]
+ vbroadcasti32x4 zmm19,ZMMWORD[((-80))+r11]
+ vbroadcasti32x4 zmm20,ZMMWORD[((-64))+r11]
+ vbroadcasti32x4 zmm21,ZMMWORD[((-48))+r11]
+ vbroadcasti32x4 zmm22,ZMMWORD[((-32))+r11]
+ vbroadcasti32x4 zmm23,ZMMWORD[((-16))+r11]
+$L$crypt_loop_4x__func4:
+ vmovdqu8 zmm4,ZMMWORD[rcx]
+ vmovdqu8 zmm5,ZMMWORD[64+rcx]
+ vmovdqu8 zmm6,ZMMWORD[128+rcx]
+ vmovdqu8 zmm7,ZMMWORD[192+rcx]
+
+
+
+ vpshufb zmm0,zmm12,zmm8
+ vpaddd zmm12,zmm12,zmm11
+ vpshufb zmm1,zmm12,zmm8
+ vpaddd zmm12,zmm12,zmm11
+ vpshufb zmm2,zmm12,zmm8
+ vpaddd zmm12,zmm12,zmm11
+ vpshufb zmm3,zmm12,zmm8
+ vpaddd zmm12,zmm12,zmm11
+
+
+ vpxord zmm0,zmm0,zmm13
+ vpxord zmm1,zmm1,zmm13
+ vpxord zmm2,zmm2,zmm13
+ vpxord zmm3,zmm3,zmm13
+
+ cmp r10d,24
+ jl NEAR $L$aes128__func4
+ je NEAR $L$aes192__func4
+
+ vbroadcasti32x4 zmm9,ZMMWORD[((-208))+r11]
+ vaesenc zmm0,zmm0,zmm9
+ vaesenc zmm1,zmm1,zmm9
+ vaesenc zmm2,zmm2,zmm9
+ vaesenc zmm3,zmm3,zmm9
+
+ vbroadcasti32x4 zmm9,ZMMWORD[((-192))+r11]
+ vaesenc zmm0,zmm0,zmm9
+ vaesenc zmm1,zmm1,zmm9
+ vaesenc zmm2,zmm2,zmm9
+ vaesenc zmm3,zmm3,zmm9
+
+$L$aes192__func4:
+ vbroadcasti32x4 zmm9,ZMMWORD[((-176))+r11]
+ vaesenc zmm0,zmm0,zmm9
+ vaesenc zmm1,zmm1,zmm9
+ vaesenc zmm2,zmm2,zmm9
+ vaesenc zmm3,zmm3,zmm9
+
+ vbroadcasti32x4 zmm9,ZMMWORD[((-160))+r11]
+ vaesenc zmm0,zmm0,zmm9
+ vaesenc zmm1,zmm1,zmm9
+ vaesenc zmm2,zmm2,zmm9
+ vaesenc zmm3,zmm3,zmm9
+
+$L$aes128__func4:
+ vpshufb zmm4,zmm4,zmm8
+ vpxord zmm4,zmm4,zmm10
+ vpshufb zmm5,zmm5,zmm8
+ vpshufb zmm6,zmm6,zmm8
+
+ vaesenc zmm0,zmm0,zmm15
+ vaesenc zmm1,zmm1,zmm15
+ vaesenc zmm2,zmm2,zmm15
+ vaesenc zmm3,zmm3,zmm15
+
+ vpshufb zmm7,zmm7,zmm8
+ vpclmulqdq zmm10,zmm4,zmm27,0x00
+ vpclmulqdq zmm24,zmm5,zmm28,0x00
+ vpclmulqdq zmm25,zmm6,zmm29,0x00
+
+ vaesenc zmm0,zmm0,zmm16
+ vaesenc zmm1,zmm1,zmm16
+ vaesenc zmm2,zmm2,zmm16
+ vaesenc zmm3,zmm3,zmm16
+
+ vpxord zmm10,zmm10,zmm24
+ vpclmulqdq zmm26,zmm7,zmm30,0x00
+ vpternlogd zmm10,zmm25,zmm26,0x96
+ vpclmulqdq zmm24,zmm4,zmm27,0x01
+
+ vaesenc zmm0,zmm0,zmm17
+ vaesenc zmm1,zmm1,zmm17
+ vaesenc zmm2,zmm2,zmm17
+ vaesenc zmm3,zmm3,zmm17
+
+ vpclmulqdq zmm25,zmm5,zmm28,0x01
+ vpclmulqdq zmm26,zmm6,zmm29,0x01
+ vpternlogd zmm24,zmm25,zmm26,0x96
+ vpclmulqdq zmm25,zmm7,zmm30,0x01
+
+ vaesenc zmm0,zmm0,zmm18
+ vaesenc zmm1,zmm1,zmm18
+ vaesenc zmm2,zmm2,zmm18
+ vaesenc zmm3,zmm3,zmm18
+
+ vpclmulqdq zmm26,zmm4,zmm27,0x10
+ vpternlogd zmm24,zmm25,zmm26,0x96
+ vpclmulqdq zmm25,zmm5,zmm28,0x10
+ vpclmulqdq zmm26,zmm6,zmm29,0x10
+
+ vaesenc zmm0,zmm0,zmm19
+ vaesenc zmm1,zmm1,zmm19
+ vaesenc zmm2,zmm2,zmm19
+ vaesenc zmm3,zmm3,zmm19
+
+ vpternlogd zmm24,zmm25,zmm26,0x96
+ vpclmulqdq zmm26,zmm31,zmm10,0x01
+ vpclmulqdq zmm25,zmm7,zmm30,0x10
+ vpxord zmm24,zmm24,zmm25
+
+ vaesenc zmm0,zmm0,zmm20
+ vaesenc zmm1,zmm1,zmm20
+ vaesenc zmm2,zmm2,zmm20
+ vaesenc zmm3,zmm3,zmm20
+
+ vpshufd zmm10,zmm10,0x4e
+ vpclmulqdq zmm4,zmm4,zmm27,0x11
+ vpclmulqdq zmm5,zmm5,zmm28,0x11
+ vpclmulqdq zmm6,zmm6,zmm29,0x11
+
+ vaesenc zmm0,zmm0,zmm21
+ vaesenc zmm1,zmm1,zmm21
+ vaesenc zmm2,zmm2,zmm21
+ vaesenc zmm3,zmm3,zmm21
+
+ vpternlogd zmm24,zmm10,zmm26,0x96
+ vpclmulqdq zmm7,zmm7,zmm30,0x11
+ vpternlogd zmm4,zmm5,zmm6,0x96
+ vpclmulqdq zmm25,zmm31,zmm24,0x01
+
+ vaesenc zmm0,zmm0,zmm22
+ vaesenc zmm1,zmm1,zmm22
+ vaesenc zmm2,zmm2,zmm22
+ vaesenc zmm3,zmm3,zmm22
+
+ vpxord zmm10,zmm4,zmm7
+ vpshufd zmm24,zmm24,0x4e
+ vpternlogd zmm10,zmm24,zmm25,0x96
+
+ vaesenc zmm0,zmm0,zmm23
+ vaesenc zmm1,zmm1,zmm23
+ vaesenc zmm2,zmm2,zmm23
+ vaesenc zmm3,zmm3,zmm23
+
+ vextracti32x4 xmm4,zmm10,1
+ vextracti32x4 xmm5,zmm10,2
+ vextracti32x4 xmm6,zmm10,3
+ vpxord xmm10,xmm10,xmm4
+ vpternlogd xmm10,xmm6,xmm5,0x96
+
+
+
+
+ vpxord zmm4,zmm14,ZMMWORD[rcx]
+ vpxord zmm5,zmm14,ZMMWORD[64+rcx]
+ vpxord zmm6,zmm14,ZMMWORD[128+rcx]
+ vpxord zmm7,zmm14,ZMMWORD[192+rcx]
+
+
+
+ vaesenclast zmm4,zmm0,zmm4
+ vaesenclast zmm5,zmm1,zmm5
+ vaesenclast zmm6,zmm2,zmm6
+ vaesenclast zmm7,zmm3,zmm7
+
+
+ vmovdqu8 ZMMWORD[rdx],zmm4
+ vmovdqu8 ZMMWORD[64+rdx],zmm5
+ vmovdqu8 ZMMWORD[128+rdx],zmm6
+ vmovdqu8 ZMMWORD[192+rdx],zmm7
+
+ sub rcx,-4*64
+ sub rdx,-4*64
+ add r8,-4*64
+ cmp r8,4*64-1
+ ja NEAR $L$crypt_loop_4x__func4
+$L$crypt_loop_4x_done__func4:
+
+ test r8,r8
+ jz NEAR $L$done__func4
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ mov rax,r8
+ neg rax
+ and rax,-16
+ lea rsi,[256+rax*1+rdi]
+ vpxor xmm4,xmm4,xmm4
+ vpxor xmm5,xmm5,xmm5
+ vpxor xmm6,xmm6,xmm6
+
+ cmp r8,64
+ jb NEAR $L$partial_vec__func4
+
+$L$crypt_loop_1x__func4:
+
+
+
+ vpshufb zmm0,zmm12,zmm8
+ vpaddd zmm12,zmm12,zmm11
+ vpxord zmm0,zmm0,zmm13
+ lea rax,[16+r9]
+$L$vaesenc_loop_tail_full_vec__func4:
+ vbroadcasti32x4 zmm9,ZMMWORD[rax]
+ vaesenc zmm0,zmm0,zmm9
+ add rax,16
+ cmp r11,rax
+ jne NEAR $L$vaesenc_loop_tail_full_vec__func4
+ vaesenclast zmm0,zmm0,zmm14
+
+
+ vmovdqu8 zmm1,ZMMWORD[rcx]
+ vpxord zmm0,zmm0,zmm1
+ vmovdqu8 ZMMWORD[rdx],zmm0
+
+
+ vmovdqu8 zmm30,ZMMWORD[rsi]
+ vpshufb zmm0,zmm1,zmm8
+ vpxord zmm0,zmm0,zmm10
+ vpclmulqdq zmm7,zmm0,zmm30,0x00
+ vpclmulqdq zmm1,zmm0,zmm30,0x01
+ vpclmulqdq zmm2,zmm0,zmm30,0x10
+ vpclmulqdq zmm3,zmm0,zmm30,0x11
+ vpxord zmm4,zmm4,zmm7
+ vpternlogd zmm5,zmm1,zmm2,0x96
+ vpxord zmm6,zmm6,zmm3
+
+ vpxor xmm10,xmm10,xmm10
+
+ add rsi,64
+ add rcx,64
+ add rdx,64
+ sub r8,64
+ cmp r8,64
+ jae NEAR $L$crypt_loop_1x__func4
+
+ test r8,r8
+ jz NEAR $L$reduce__func4
+
+$L$partial_vec__func4:
+
+
+
+
+ mov rax,-1
+ bzhi rax,rax,r8
+ kmovq k1,rax
+ add r8,15
+ and r8,-16
+ mov rax,-1
+ bzhi rax,rax,r8
+ kmovq k2,rax
+
+
+
+ vpshufb zmm0,zmm12,zmm8
+ vpxord zmm0,zmm0,zmm13
+ lea rax,[16+r9]
+$L$vaesenc_loop_tail_partialvec__func4:
+ vbroadcasti32x4 zmm9,ZMMWORD[rax]
+ vaesenc zmm0,zmm0,zmm9
+ add rax,16
+ cmp r11,rax
+ jne NEAR $L$vaesenc_loop_tail_partialvec__func4
+ vaesenclast zmm0,zmm0,zmm14
+
+
+ vmovdqu8 zmm1{k1}{z},[rcx]
+ vpxord zmm0,zmm0,zmm1
+ vmovdqu8 ZMMWORD[rdx]{k1},zmm0
+
+
+
+
+
+
+
+
+
+
+
+
+
+ vmovdqu8 zmm30{k2}{z},[rsi]
+
+ vpshufb zmm0,zmm1,zmm8
+ vpxord zmm0,zmm0,zmm10
+ vpclmulqdq zmm7,zmm0,zmm30,0x00
+ vpclmulqdq zmm1,zmm0,zmm30,0x01
+ vpclmulqdq zmm2,zmm0,zmm30,0x10
+ vpclmulqdq zmm3,zmm0,zmm30,0x11
+ vpxord zmm4,zmm4,zmm7
+ vpternlogd zmm5,zmm1,zmm2,0x96
+ vpxord zmm6,zmm6,zmm3
+
+
+$L$reduce__func4:
+
+ vpclmulqdq zmm0,zmm31,zmm4,0x01
+ vpshufd zmm4,zmm4,0x4e
+ vpternlogd zmm5,zmm4,zmm0,0x96
+ vpclmulqdq zmm0,zmm31,zmm5,0x01
+ vpshufd zmm5,zmm5,0x4e
+ vpternlogd zmm6,zmm5,zmm0,0x96
+
+ vextracti32x4 xmm0,zmm6,1
+ vextracti32x4 xmm1,zmm6,2
+ vextracti32x4 xmm2,zmm6,3
+ vpxord xmm10,xmm6,xmm0
+ vpternlogd xmm10,xmm2,xmm1,0x96
+
+
+$L$done__func4:
+
+ vpshufb xmm10,xmm10,xmm8
+ vmovdqu XMMWORD[r12],xmm10
+
+ vzeroupper
+ movdqa xmm6,XMMWORD[rsp]
+ movdqa xmm7,XMMWORD[16+rsp]
+ movdqa xmm8,XMMWORD[32+rsp]
+ movdqa xmm9,XMMWORD[48+rsp]
+ movdqa xmm10,XMMWORD[64+rsp]
+ movdqa xmm11,XMMWORD[80+rsp]
+ movdqa xmm12,XMMWORD[96+rsp]
+ movdqa xmm13,XMMWORD[112+rsp]
+ movdqa xmm14,XMMWORD[128+rsp]
+ movdqa xmm15,XMMWORD[144+rsp]
+ add rsp,160
+ pop r12
+ pop rdi
+ pop rsi
+ ret
+$L$SEH_end_aes_gcm_dec_update_vaes_avx10_512_17:
+
+
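+; Windows SEH metadata: .pdata maps each function's address range to its
+; unwind information, and .xdata encodes the prologue steps recorded by the
+; SEH labels above so the saved XMM registers and stack adjustments can be
+; unwound correctly.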
+section .pdata rdata align=4
+ALIGN 4
+ DD $L$SEH_begin_gcm_gmult_vpclmulqdq_avx10_1 wrt ..imagebase
+ DD $L$SEH_end_gcm_gmult_vpclmulqdq_avx10_5 wrt ..imagebase
+ DD $L$SEH_info_gcm_gmult_vpclmulqdq_avx10_0 wrt ..imagebase
+
+ DD $L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1 wrt ..imagebase
+ DD $L$SEH_end_gcm_ghash_vpclmulqdq_avx10_256_12 wrt ..imagebase
+ DD $L$SEH_info_gcm_ghash_vpclmulqdq_avx10_256_0 wrt ..imagebase
+
+ DD $L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1 wrt ..imagebase
+ DD $L$SEH_end_aes_gcm_enc_update_vaes_avx10_256_17 wrt ..imagebase
+ DD $L$SEH_info_aes_gcm_enc_update_vaes_avx10_256_0 wrt ..imagebase
+
+ DD $L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1 wrt ..imagebase
+ DD $L$SEH_end_aes_gcm_dec_update_vaes_avx10_256_17 wrt ..imagebase
+ DD $L$SEH_info_aes_gcm_dec_update_vaes_avx10_256_0 wrt ..imagebase
+
+ DD $L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1 wrt ..imagebase
+ DD $L$SEH_end_gcm_ghash_vpclmulqdq_avx10_512_12 wrt ..imagebase
+ DD $L$SEH_info_gcm_ghash_vpclmulqdq_avx10_512_0 wrt ..imagebase
+
+ DD $L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1 wrt ..imagebase
+ DD $L$SEH_end_aes_gcm_enc_update_vaes_avx10_512_17 wrt ..imagebase
+ DD $L$SEH_info_aes_gcm_enc_update_vaes_avx10_512_0 wrt ..imagebase
+
+ DD $L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1 wrt ..imagebase
+ DD $L$SEH_end_aes_gcm_dec_update_vaes_avx10_512_17 wrt ..imagebase
+ DD $L$SEH_info_aes_gcm_dec_update_vaes_avx10_512_0 wrt ..imagebase
+
+
+section .xdata rdata align=8
+ALIGN 4
+$L$SEH_info_gcm_gmult_vpclmulqdq_avx10_0:
+ DB 1
+ DB $L$SEH_endprologue_gcm_gmult_vpclmulqdq_avx10_4-$L$SEH_begin_gcm_gmult_vpclmulqdq_avx10_1
+ DB 3
+ DB 0
+ DB $L$SEH_prologue_gcm_gmult_vpclmulqdq_avx10_3-$L$SEH_begin_gcm_gmult_vpclmulqdq_avx10_1
+ DB 104
+ DW 0
+ DB $L$SEH_prologue_gcm_gmult_vpclmulqdq_avx10_2-$L$SEH_begin_gcm_gmult_vpclmulqdq_avx10_1
+ DB 34
+
+ DW 0
+$L$SEH_info_gcm_ghash_vpclmulqdq_avx10_256_0:
+ DB 1
+ DB $L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx10_256_11-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
+ DB 18
+ DB 0
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_10-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
+ DB 216
+ DW 7
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_9-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
+ DB 200
+ DW 6
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_8-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
+ DB 184
+ DW 5
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_7-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
+ DB 168
+ DW 4
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_6-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
+ DB 152
+ DW 3
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_5-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
+ DB 136
+ DW 2
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_4-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
+ DB 120
+ DW 1
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_3-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
+ DB 104
+ DW 0
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_2-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
+ DB 1
+ DW 17
+
+$L$SEH_info_aes_gcm_enc_update_vaes_avx10_256_0:
+ DB 1
+ DB $L$SEH_endprologue_aes_gcm_enc_update_vaes_avx10_256_16-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+ DB 25
+ DB 0
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_15-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+ DB 248
+ DW 9
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_14-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+ DB 232
+ DW 8
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_13-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+ DB 216
+ DW 7
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_12-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+ DB 200
+ DW 6
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_11-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+ DB 184
+ DW 5
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_10-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+ DB 168
+ DW 4
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_9-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+ DB 152
+ DW 3
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_8-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+ DB 136
+ DW 2
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_7-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+ DB 120
+ DW 1
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_6-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+ DB 104
+ DW 0
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_5-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+ DB 1
+ DW 20
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_4-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+ DB 192
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_3-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+ DB 112
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_2-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+ DB 96
+
+ DW 0
+$L$SEH_info_aes_gcm_dec_update_vaes_avx10_256_0:
+ DB 1
+ DB $L$SEH_endprologue_aes_gcm_dec_update_vaes_avx10_256_16-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+ DB 25
+ DB 0
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_15-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+ DB 248
+ DW 9
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_14-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+ DB 232
+ DW 8
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_13-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+ DB 216
+ DW 7
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_12-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+ DB 200
+ DW 6
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_11-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+ DB 184
+ DW 5
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_10-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+ DB 168
+ DW 4
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_9-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+ DB 152
+ DW 3
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_8-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+ DB 136
+ DW 2
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_7-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+ DB 120
+ DW 1
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_6-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+ DB 104
+ DW 0
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_5-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+ DB 1
+ DW 20
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_4-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+ DB 192
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_3-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+ DB 112
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_2-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+ DB 96
+
+ DW 0
+$L$SEH_info_gcm_ghash_vpclmulqdq_avx10_512_0:
+ DB 1
+ DB $L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx10_512_11-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1
+ DB 18
+ DB 0
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_10-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1
+ DB 216
+ DW 7
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_9-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1
+ DB 200
+ DW 6
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_8-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1
+ DB 184
+ DW 5
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_7-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1
+ DB 168
+ DW 4
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_6-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1
+ DB 152
+ DW 3
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_5-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1
+ DB 136
+ DW 2
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_4-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1
+ DB 120
+ DW 1
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_3-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1
+ DB 104
+ DW 0
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_2-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1
+ DB 1
+ DW 17
+
+$L$SEH_info_aes_gcm_enc_update_vaes_avx10_512_0:
+ DB 1
+ DB $L$SEH_endprologue_aes_gcm_enc_update_vaes_avx10_512_16-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+ DB 25
+ DB 0
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_15-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+ DB 248
+ DW 9
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_14-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+ DB 232
+ DW 8
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_13-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+ DB 216
+ DW 7
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_12-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+ DB 200
+ DW 6
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_11-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+ DB 184
+ DW 5
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_10-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+ DB 168
+ DW 4
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_9-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+ DB 152
+ DW 3
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_8-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+ DB 136
+ DW 2
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_7-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+ DB 120
+ DW 1
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_6-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+ DB 104
+ DW 0
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_5-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+ DB 1
+ DW 20
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_4-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+ DB 192
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_3-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+ DB 112
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_2-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+ DB 96
+
+ DW 0
+$L$SEH_info_aes_gcm_dec_update_vaes_avx10_512_0:
+ DB 1
+ DB $L$SEH_endprologue_aes_gcm_dec_update_vaes_avx10_512_16-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+ DB 25
+ DB 0
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_15-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+ DB 248
+ DW 9
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_14-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+ DB 232
+ DW 8
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_13-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+ DB 216
+ DW 7
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_12-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+ DB 200
+ DW 6
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_11-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+ DB 184
+ DW 5
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_10-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+ DB 168
+ DW 4
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_9-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+ DB 152
+ DW 3
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_8-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+ DB 136
+ DW 2
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_7-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+ DB 120
+ DW 1
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_6-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+ DB 104
+ DW 0
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_5-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+ DB 1
+ DW 20
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_4-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+ DB 192
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_3-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+ DB 112
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_2-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+ DB 96
+
+ DW 0
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
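
Side note (not part of the patch): each $L$SEH_info_* block above is a Windows x64 UNWIND_INFO record, which is why the generated bytes look so repetitive. The header is DB version (1), DB prologue size (the label difference), DB count of unwind codes, DB 0 (no frame register), followed by 16-bit UNWIND_CODE slots: a prologue offset byte, then an op byte whose low nibble is the operation and whose high nibble is the register or extra info, then an optional DW operand. That is how, for example, DB 248 / DW 9 reads as "save xmm15 at rsp+144", DB 1 / DW 20 as "sub rsp, 160", and DB 192 / 112 / 96 as pushes of r12, rdi, and rsi; the enc/dec tables carry 25 codes (odd), hence the trailing DW 0 padding that the 18-code ghash tables do not need. The C sketch below only illustrates that decoding; the prologue offsets in it are placeholder values, since the real ones are label differences not visible in the listing.

/* Minimal decoder sketch for the Windows x64 UNWIND_CODE groups above.
 * Illustration only; offsets are placeholders, not values from the patch. */
#include <stdint.h>
#include <stdio.h>

/* Unwind operation codes (low nibble of the op byte). */
enum {
  UWOP_PUSH_NONVOL = 0, /* push of a nonvolatile GPR; high nibble = GPR number */
  UWOP_ALLOC_LARGE = 1, /* sub rsp, imm; the following DW holds imm / 8        */
  UWOP_SAVE_XMM128 = 8, /* save of an XMM reg; the following DW holds offset/16 */
};

static const char *const kGpr[16] = {
    "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
    "r8",  "r9",  "r10", "r11", "r12", "r13", "r14", "r15"};

/* Mirrors one "DB offset / DB op / [DW operand]" group from the asm. */
struct unwind_code {
  uint8_t prologue_offset; /* stands in for $L$SEH_prologue_* - $L$SEH_begin_* */
  uint8_t op_byte;         /* OpInfo in the high nibble, UnwindOp in the low   */
  uint16_t operand;        /* the trailing DW, when the op uses one            */
};

static void decode(const struct unwind_code *c, size_t n) {
  for (size_t i = 0; i < n; i++) {
    uint8_t op = c[i].op_byte & 0x0f;
    uint8_t info = c[i].op_byte >> 4;
    switch (op) {
      case UWOP_SAVE_XMM128:
        printf("+0x%02x: save xmm%u at [rsp + %u]\n", c[i].prologue_offset,
               info, c[i].operand * 16u);
        break;
      case UWOP_ALLOC_LARGE:
        printf("+0x%02x: sub rsp, %u\n", c[i].prologue_offset,
               c[i].operand * 8u);
        break;
      case UWOP_PUSH_NONVOL:
        printf("+0x%02x: push %s\n", c[i].prologue_offset, kGpr[info]);
        break;
      default:
        printf("+0x%02x: unhandled op %u\n", c[i].prologue_offset, op);
        break;
    }
  }
}

int main(void) {
  /* A few representative codes from the tail of
   * $L$SEH_info_aes_gcm_enc_update_vaes_avx10_256_0, with made-up offsets. */
  static const struct unwind_code codes[] = {
      {0x40, 248, 9},  /* DB 248 / DW 9  -> save xmm15 at [rsp + 144] */
      {0x0c, 1, 20},   /* DB 1   / DW 20 -> sub rsp, 160              */
      {0x07, 192, 0},  /* DB 192         -> push r12                  */
      {0x05, 112, 0},  /* DB 112         -> push rdi                  */
      {0x04, 96, 0},   /* DB 96          -> push rsi                  */
  };
  decode(codes, sizeof(codes) / sizeof(codes[0]));
  return 0;
}
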
diff --git a/gen/sources.bzl b/gen/sources.bzl
index b258d42..9fa3e11 100644
--- a/gen/sources.bzl
+++ b/gen/sources.bzl
@@ -94,6 +94,8 @@
]
bcm_sources_asm = [
+ "gen/bcm/aes-gcm-avx10-x86_64-apple.S",
+ "gen/bcm/aes-gcm-avx10-x86_64-linux.S",
"gen/bcm/aesni-gcm-x86_64-apple.S",
"gen/bcm/aesni-gcm-x86_64-linux.S",
"gen/bcm/aesni-x86-apple.S",
@@ -192,6 +194,7 @@
]
bcm_sources_nasm = [
+ "gen/bcm/aes-gcm-avx10-x86_64-win.asm",
"gen/bcm/aesni-gcm-x86_64-win.asm",
"gen/bcm/aesni-x86-win.asm",
"gen/bcm/aesni-x86_64-win.asm",
diff --git a/gen/sources.cmake b/gen/sources.cmake
index 6d1c32f..3d79734 100644
--- a/gen/sources.cmake
+++ b/gen/sources.cmake
@@ -100,6 +100,8 @@
set(
BCM_SOURCES_ASM
+ gen/bcm/aes-gcm-avx10-x86_64-apple.S
+ gen/bcm/aes-gcm-avx10-x86_64-linux.S
gen/bcm/aesni-gcm-x86_64-apple.S
gen/bcm/aesni-gcm-x86_64-linux.S
gen/bcm/aesni-x86-apple.S
@@ -200,6 +202,7 @@
set(
BCM_SOURCES_NASM
+ gen/bcm/aes-gcm-avx10-x86_64-win.asm
gen/bcm/aesni-gcm-x86_64-win.asm
gen/bcm/aesni-x86-win.asm
gen/bcm/aesni-x86_64-win.asm
diff --git a/gen/sources.gni b/gen/sources.gni
index 3b72a79..0abc62b 100644
--- a/gen/sources.gni
+++ b/gen/sources.gni
@@ -94,6 +94,8 @@
]
bcm_sources_asm = [
+ "gen/bcm/aes-gcm-avx10-x86_64-apple.S",
+ "gen/bcm/aes-gcm-avx10-x86_64-linux.S",
"gen/bcm/aesni-gcm-x86_64-apple.S",
"gen/bcm/aesni-gcm-x86_64-linux.S",
"gen/bcm/aesni-x86-apple.S",
@@ -192,6 +194,7 @@
]
bcm_sources_nasm = [
+ "gen/bcm/aes-gcm-avx10-x86_64-win.asm",
"gen/bcm/aesni-gcm-x86_64-win.asm",
"gen/bcm/aesni-x86-win.asm",
"gen/bcm/aesni-x86_64-win.asm",
diff --git a/gen/sources.json b/gen/sources.json
index ded253d..6afbc27 100644
--- a/gen/sources.json
+++ b/gen/sources.json
@@ -78,6 +78,8 @@
"crypto/fipsmodule/tls/kdf.cc.inc"
],
"asm": [
+ "gen/bcm/aes-gcm-avx10-x86_64-apple.S",
+ "gen/bcm/aes-gcm-avx10-x86_64-linux.S",
"gen/bcm/aesni-gcm-x86_64-apple.S",
"gen/bcm/aesni-gcm-x86_64-linux.S",
"gen/bcm/aesni-x86-apple.S",
@@ -175,6 +177,7 @@
"third_party/fiat/asm/fiat_p256_adx_sqr.S"
],
"nasm": [
+ "gen/bcm/aes-gcm-avx10-x86_64-win.asm",
"gen/bcm/aesni-gcm-x86_64-win.asm",
"gen/bcm/aesni-x86-win.asm",
"gen/bcm/aesni-x86_64-win.asm",