Add VAES and VPCLMULQDQ accelerated AES-GCM

Add an AES-GCM implementation for x86_64 that uses VAES, VPCLMULQDQ, and
either AVX10 or a compatible AVX512 feature set.  The assembly code is
based on the code I wrote for the Linux kernel
(https://git.kernel.org/linus/b06affb1cb580e13).  Some substantial
changes were needed for BoringSSL integration; see the file comment.

The following tables compare the performance of AES-256-GCM before and
after this patch, and also versus the alternative patch from Cloudflare
(https://boringssl-review.googlesource.com/c/boringssl/+/65987/3).  All
tables show throughput in MB/s, by implementation (rows) and message
length in bytes (columns).  All benchmarks were done using
EVP_AEAD_CTX_seal() and EVP_AEAD_CTX_open() with an associated data
length of 16 bytes.
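
For reference, each data point corresponds roughly to one call of the
following shape (a minimal sketch using BoringSSL's EVP_AEAD API; the
helper name, key/nonce values, and the 16384-byte buffers are
illustrative, not the exact benchmark harness):

    #include <openssl/aead.h>

    // Seal one 16384-byte message with AES-256-GCM and 16 bytes of AAD.
    static int seal_once(void) {
      static uint8_t key[32], nonce[12], ad[16];
      static uint8_t in[16384];
      static uint8_t out[16384 + EVP_AEAD_MAX_OVERHEAD];
      size_t out_len;

      EVP_AEAD_CTX ctx;
      if (!EVP_AEAD_CTX_init(&ctx, EVP_aead_aes_256_gcm(), key, sizeof(key),
                             EVP_AEAD_DEFAULT_TAG_LENGTH, NULL)) {
        return 0;
      }
      int ok = EVP_AEAD_CTX_seal(&ctx, out, &out_len, sizeof(out),
                                 nonce, sizeof(nonce), in, sizeof(in),
                                 ad, sizeof(ad));
      EVP_AEAD_CTX_cleanup(&ctx);
      return ok;
    }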

AMD Zen 5, Granite Ridge (encryption):

               | 16384 |  4096 |  4095 |  1420 |   512 |   500 |
    -----------+-------+-------+-------+-------+-------+-------+
    This patch | 26358 | 21295 | 17402 | 10672 |  7798 |  4840 |
    Cloudflare | 22363 | 18330 | 17008 | 10979 |  7070 |  5870 |
    Existing   |  7194 |  6743 |  6465 |  5404 |  4075 |  3563 |

               |   300 |   200 |    64 |    63 |    16 |
    -----------+-------+-------+-------+-------+-------+
    This patch |  3248 |  2557 |  1359 |   937 |   537 |
    Cloudflare |  3624 |  2770 |  1293 |  1028 |   517 |
    Existing   |  2938 |  2271 |  1266 |   959 |   528 |

AMD Zen 5, Granite Ridge (decryption):

               | 16384 |  4096 |  4095 |  1420 |   512 |   500 |
    -----------+-------+-------+-------+-------+-------+-------+
    This patch | 27214 | 22298 | 18824 | 11401 |  8496 |  5399 |
    Cloudflare | 22629 | 19257 | 17792 | 11575 |  7807 |  6031 |
    Existing   |  7122 |  6805 |  6228 |  4922 |  4604 |  3565 |

               |   300 |   200 |    64 |    63 |    16 |
    -----------+-------+-------+-------+-------+-------+
    This patch |  3637 |  2497 |  1483 |   952 |   589 |
    Cloudflare |  3714 |  2847 |  1437 |  1030 |   567 |
    Existing   |  3012 |  2354 |  1514 |   880 |   632 |

AMD Zen 4, Genoa (encryption):

               | 16384 |  4096 |  4095 |  1420 |   512 |   500 |
    -----------+-------+-------+-------+-------+-------+-------+
    This patch | 10093 |  8907 |  7614 |  5399 |  4247 |  2719 |
    Cloudflare |  9174 |  8073 |  7521 |  5414 |  3786 |  3111 |
    Existing   |  4239 |  3964 |  3800 |  3186 |  2398 |  2069 |

               |   300 |   200 |    64 |    63 |    16 |
    -----------+-------+-------+-------+-------+-------+
    This patch |  1940 |  1553 |   851 |   581 |   343 |
    Cloudflare |  2023 |  1619 |   775 |   619 |   311 |
    Existing   |  1735 |  1334 |   775 |   573 |   317 |

AMD Zen 4, Genoa (decryption):

               | 16384 |  4096 |  4095 |  1420 |   512 |   500 |
    -----------+-------+-------+-------+-------+-------+-------+
    This patch | 10108 |  8922 |  7879 |  5526 |  4250 |  2872 |
    Cloudflare |  9441 |  8347 |  7723 |  5366 |  3902 |  3067 |
    Existing   |  4249 |  3999 |  3810 |  3101 |  2535 |  2026 |

               |   300 |   200 |    64 |    63 |    16 |
    -----------+-------+-------+-------+-------+-------+
    This patch |  2031 |  1536 |   868 |   568 |   346 |
    Cloudflare |  1933 |  1579 |   765 |   569 |   300 |
    Existing   |  1723 |  1381 |   806 |   516 |   345 |

Intel Emerald Rapids (encryption):

               | 16384 |  4096 |  4095 |  1420 |   512 |   500 |
    -----------+-------+-------+-------+-------+-------+-------+
    This patch | 13974 | 11827 | 10166 |  6601 |  4904 |  3334 |
    Cloudflare | 12735 | 10752 |  9966 |  6709 |  4524 |  3647 |
    Existing   |  5237 |  4831 |  4639 |  3747 |  2816 |  2409 |

               |   300 |   200 |    64 |    63 |    16 |
    -----------+-------+-------+-------+-------+-------+
    This patch |  2251 |  1763 |   915 |   649 |   363 |
    Cloudflare |  2329 |  1850 |   855 |   676 |   342 |
    Existing   |  1971 |  1502 |   808 |   626 |   359 |

Intel Emerald Rapids (decryption):

               | 16384 |  4096 |  4095 |  1420 |   512 |   500 |
    -----------+-------+-------+-------+-------+-------+-------+
    This patch | 14239 | 12180 | 10370 |  6692 |  5305 |  3344 |
    Cloudflare | 13348 | 11485 | 10460 |  6736 |  5229 |  3641 |
    Existing   |  5306 |  4958 |  4702 |  3767 |  3071 |  2432 |

               |   300 |   200 |    64 |    63 |    16 |
    -----------+-------+-------+-------+-------+-------+
    This patch |  2197 |  2077 |  1040 |   628 |   390 |
    Cloudflare |  2186 |  1911 |   938 |   615 |   370 |
    Existing   |  2024 |  1727 |   999 |   599 |   421 |

Intel Sapphire Rapids (encryption):

               | 16384 |  4096 |  4095 |  1420 |   512 |   500 |
    -----------+-------+-------+-------+-------+-------+-------+
    This patch | 12726 | 10618 |  9248 |  6012 |  4466 |  2986 |
    Cloudflare | 11059 |  9794 |  9071 |  6052 |  4089 |  3306 |
    Existing   |  4761 |  4397 |  4222 |  3408 |  2560 |  2188 |

               |   300 |   200 |    64 |    63 |    16 |
    -----------+-------+-------+-------+-------+-------+
    This patch |  2051 |  1612 |   838 |   579 |   351 |
    Cloudflare |  2110 |  1686 |   775 |   622 |   311 |
    Existing   |  1792 |  1369 |   733 |   567 |   324 |

Intel Sapphire Rapids (decryption):

               | 16384 |  4096 |  4095 |  1420 |   512 |   500 |
    -----------+-------+-------+-------+-------+-------+-------+
    This patch | 12951 | 11100 |  9447 |  6067 |  4862 |  3030 |
    Cloudflare | 12165 | 10421 |  9506 |  6126 |  4767 |  3321 |
    Existing   |  4807 |  4507 |  4275 |  3400 |  2791 |  2216 |

               |   300 |   200 |    64 |    63 |    16 |
    -----------+-------+-------+-------+-------+-------+
    This patch |  2003 |  1894 |   950 |   572 |   357 |
    Cloudflare |  1999 |  1741 |   857 |   559 |   328 |
    Existing   |  1831 |  1571 |   838 |   539 |   382 |

Change-Id: I5b0833d2ffe8fd273cb38a26cd104c52c3532ceb
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/70187
Reviewed-by: David Benjamin <davidben@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/build.json b/build.json
index 04d9bf9..9a667f3 100644
--- a/build.json
+++ b/build.json
@@ -132,6 +132,7 @@
         ],
         "perlasm_x86_64": [
             {"src": "crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl"},
+            {"src": "crypto/fipsmodule/modes/asm/aes-gcm-avx10-x86_64.pl"},
             {"src": "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"},
             {"src": "crypto/fipsmodule/modes/asm/ghash-ssse3-x86_64.pl"},
             {"src": "crypto/fipsmodule/modes/asm/ghash-x86_64.pl"},
diff --git a/crypto/crypto.cc b/crypto/crypto.cc
index ead0543..9f8eba4 100644
--- a/crypto/crypto.cc
+++ b/crypto/crypto.cc
@@ -54,7 +54,7 @@
 // archive, linking on OS X will fail to resolve common symbols. By
 // initialising it to zero, it becomes a "data symbol", which isn't so
 // affected.
-HIDDEN uint8_t BORINGSSL_function_hit[7] = {0};
+HIDDEN uint8_t BORINGSSL_function_hit[8] = {0};
 #endif
 
 #if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
diff --git a/crypto/fipsmodule/modes/asm/aes-gcm-avx10-x86_64.pl b/crypto/fipsmodule/modes/asm/aes-gcm-avx10-x86_64.pl
new file mode 100644
index 0000000..b65dee9
--- /dev/null
+++ b/crypto/fipsmodule/modes/asm/aes-gcm-avx10-x86_64.pl
@@ -0,0 +1,1358 @@
+#!/usr/bin/env perl
+# Copyright 2024 Google LLC
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+#
+#------------------------------------------------------------------------------
+#
+# VAES and VPCLMULQDQ optimized AES-GCM for x86_64
+#
+# This file is based on aes-gcm-avx10-x86_64.S from the Linux kernel
+# (https://git.kernel.org/linus/b06affb1cb580e13).  The following notable
+# changes have been made:
+#
+# - Relicensed under BoringSSL's preferred license.
+#
+# - Converted from GNU assembler to "perlasm".  This was necessary for
+#   compatibility with BoringSSL's Windows builds which use NASM instead of the
+#   GNU assembler.  It was also necessary for compatibility with the 'delocate'
+#   tool used in BoringSSL's FIPS builds.
+#
+# - Added support for the Windows ABI.
+#
+# - Changed function prototypes to be compatible with what BoringSSL wants.
+#
+# - Removed the optimized finalization function, as BoringSSL doesn't want it.
+#
+# - Added a single-block GHASH multiplication function, as BoringSSL needs this.
+#
+# - Added optimization for large amounts of AAD.
+#
+#------------------------------------------------------------------------------
+#
+# This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
+# support VAES (vector AES), VPCLMULQDQ (vector carryless multiplication), and
+# either AVX512 or AVX10.  Some of the functions, notably the encryption and
+# decryption update functions which are the most performance-critical, are
+# provided in two variants generated from a macro: one using 256-bit vectors
+# (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512).  The
+# other, "shared" functions (vaes_avx10) use at most 256-bit vectors.
+#
+# The functions that use 512-bit vectors are intended for CPUs that support
+# 512-bit vectors *and* where using them doesn't cause significant
+# downclocking.  They require the following CPU features:
+#
+#       VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/512)
+#
+# The other functions require the following CPU features:
+#
+#       VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/256)
+#
+# Note that we use "avx10" in the names of the functions as a shorthand to
+# really mean "AVX10 or a certain set of AVX512 features".  Due to Intel's
+# introduction of AVX512 and then its replacement by AVX10, there doesn't seem
+# to be a simple way to name things that makes sense on all CPUs.
+#
+# Note that the macros that support both 256-bit and 512-bit vectors could
+# fairly easily be changed to support 128-bit too.  However, this would *not*
+# be sufficient to allow the code to run on CPUs without AVX512 or AVX10,
+# because the code heavily uses several features of these extensions other than
+# the vector length: the increase in the number of SIMD registers from 16 to
+# 32, masking support, and new instructions such as vpternlogd (which can do a
+# three-argument XOR).  These features are very useful for AES-GCM.
+
+$flavour = shift;
+$output  = shift;
+if ( $flavour =~ /\./ ) { $output = $flavour; undef $flavour; }
+
+if ( $flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/ ) {
+    $win64   = 1;
+    @argregs = ( "%rcx", "%rdx", "%r8", "%r9" );
+}
+else {
+    $win64   = 0;
+    @argregs = ( "%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9" );
+}
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/;
+$dir = $1;
+( $xlate = "${dir}x86_64-xlate.pl" and -f $xlate )
+  or ( $xlate = "${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate )
+  or die "can't locate x86_64-xlate.pl";
+
+open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT = *OUT;
+
+sub _begin_func {
+    my ( $funcname, $uses_seh ) = @_;
+    $g_cur_func_name          = $funcname;
+    $g_cur_func_uses_seh      = $uses_seh;
+    @g_cur_func_saved_gpregs  = ();
+    @g_cur_func_saved_xmmregs = ();
+    return <<___;
+.globl $funcname
+.type $funcname,\@abi-omnipotent
+.align 32
+$funcname:
+    .cfi_startproc
+    @{[ $uses_seh ? ".seh_startproc" : "" ]}
+    _CET_ENDBR
+___
+}
+
+# Push a list of general purpose registers onto the stack.
+sub _save_gpregs {
+    my @gpregs = @_;
+    my $code   = "";
+    die "_save_gpregs requires uses_seh" unless $g_cur_func_uses_seh;
+    die "_save_gpregs can only be called once per function"
+      if @g_cur_func_saved_gpregs;
+    die "Order must be _save_gpregs, then _save_xmmregs"
+      if @g_cur_func_saved_xmmregs;
+    @g_cur_func_saved_gpregs = @gpregs;
+    for my $reg (@gpregs) {
+        $code .= "push $reg\n";
+        if ($win64) {
+            $code .= ".seh_pushreg $reg\n";
+        }
+        else {
+            $code .= ".cfi_push $reg\n";
+        }
+    }
+    return $code;
+}
+
+# Push a list of xmm registers onto the stack if the target is Windows.
+sub _save_xmmregs {
+    my @xmmregs     = @_;
+    my $num_xmmregs = scalar @xmmregs;
+    my $code        = "";
+    die "_save_xmmregs requires uses_seh" unless $g_cur_func_uses_seh;
+    die "_save_xmmregs can only be called once per function"
+      if @g_cur_func_saved_xmmregs;
+    if ( $win64 and $num_xmmregs > 0 ) {
+        @g_cur_func_saved_xmmregs = @xmmregs;
+        my $is_misaligned = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0;
+        my $alloc_size    = 16 * $num_xmmregs + ( $is_misaligned ? 8 : 0 );
+        $code .= "sub \$$alloc_size, %rsp\n";
+        $code .= ".seh_stackalloc $alloc_size\n";
+        for my $i ( 0 .. $num_xmmregs - 1 ) {
+            my $reg_num = $xmmregs[$i];
+            my $pos     = 16 * $i;
+            $code .= "movdqa %xmm$reg_num, $pos(%rsp)\n";
+            $code .= ".seh_savexmm %xmm$reg_num, $pos\n";
+        }
+    }
+    return $code;
+}
+
+sub _end_func {
+    my $code = "";
+
+    # Restore any xmm registers that were saved earlier.
+    my $num_xmmregs = scalar @g_cur_func_saved_xmmregs;
+    if ( $win64 and $num_xmmregs > 0 ) {
+        my $need_alignment = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0;
+        my $alloc_size     = 16 * $num_xmmregs + ( $need_alignment ? 8 : 0 );
+        for my $i ( 0 .. $num_xmmregs - 1 ) {
+            my $reg_num = $g_cur_func_saved_xmmregs[$i];
+            my $pos     = 16 * $i;
+            $code .= "movdqa $pos(%rsp), %xmm$reg_num\n";
+        }
+        $code .= "add $alloc_size, %rsp\n";
+    }
+
+    # Restore any general purpose registers that were saved earlier.
+    for my $reg ( reverse @g_cur_func_saved_gpregs ) {
+        $code .= "pop $reg\n";
+        if ( !$win64 ) {
+            $code .= ".cfi_pop $reg\n";
+        }
+    }
+
+    $code .= <<___;
+    ret
+    @{[ $g_cur_func_uses_seh ? ".seh_endproc" : "" ]}
+    .cfi_endproc
+    .size   $g_cur_func_name, . - $g_cur_func_name
+___
+    return $code;
+}
+
+$code = <<___;
+.section .rodata
+.align 64
+
+    # A shuffle mask that reflects the bytes of 16-byte blocks
+.Lbswap_mask:
+    .quad   0x08090a0b0c0d0e0f, 0x0001020304050607
+
+    # This is the GHASH reducing polynomial without its constant term, i.e.
+    # x^128 + x^7 + x^2 + x, represented using the backwards mapping
+    # between bits and polynomial coefficients.
+    #
+    # Alternatively, it can be interpreted as the naturally-ordered
+    # representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the
+    # "reversed" GHASH reducing polynomial without its x^128 term.
+.Lgfpoly:
+    .quad   1, 0xc200000000000000
+
+    # Same as above, but with the (1 << 64) bit set.
+.Lgfpoly_and_internal_carrybit:
+    .quad   1, 0xc200000000000001
+
+    # The below constants are used for incrementing the counter blocks.
+    # ctr_pattern points to the four 128-bit values [0, 1, 2, 3].
+    # inc_2blocks and inc_4blocks point to the single 128-bit values 2 and
+    # 4.  Note that the same '2' is reused in ctr_pattern and inc_2blocks.
+.Lctr_pattern:
+    .quad   0, 0
+    .quad   1, 0
+.Linc_2blocks:
+    .quad   2, 0
+    .quad   3, 0
+.Linc_4blocks:
+    .quad   4, 0
+
+.text
+___
+
+# Number of powers of the hash key stored in the key struct.  The powers are
+# stored from highest (H^NUM_H_POWERS) to lowest (H^1).
+$NUM_H_POWERS = 16;
+
+$OFFSETOFEND_H_POWERS = $NUM_H_POWERS * 16;
+
+# Offset to 'rounds' in AES_KEY struct
+$OFFSETOF_AES_ROUNDS = 240;
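+# (AES_KEY stores 4 * (AES_MAXNR + 1) = 60 uint32_t round-key words, i.e. 240
+# bytes, ahead of the 'rounds' field, hence the offset of 240.)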
+
+# The current vector length in bytes
+undef $VL;
+
+# Set the vector length in bytes.  This sets the VL variable and defines
+# register aliases V0-V31 that map to the ymm or zmm registers.
+sub _set_veclen {
+    ($VL) = @_;
+    foreach my $i ( 0 .. 31 ) {
+        if ( $VL == 32 ) {
+            ${"V${i}"} = "%ymm${i}";
+        }
+        elsif ( $VL == 64 ) {
+            ${"V${i}"} = "%zmm${i}";
+        }
+        else {
+            die "Unsupported vector length";
+        }
+    }
+}
+
+# The _ghash_mul_step macro does one step of GHASH multiplication of the
+# 128-bit lanes of \a by the corresponding 128-bit lanes of \b, storing the
+# reduced products in \dst.  \t0, \t1, and \t2 are temporary registers of the
+# same size as \a and \b.  To complete all steps, this must be invoked with
+# \i=0 through \i=9.  The division into steps allows users of this macro to
+# optionally interleave the computation with other instructions.  Users of
+# this macro must preserve the parameter registers across steps.
+#
+# The multiplications are done in GHASH's representation of the finite field
+# GF(2^128).  Elements of GF(2^128) are represented as binary polynomials
+# (i.e. polynomials whose coefficients are bits) modulo a reducing polynomial
+# G.  The GCM specification uses G = x^128 + x^7 + x^2 + x + 1.  Addition is
+# just XOR, while multiplication is more complex and has two parts: (a) do
+# carryless multiplication of two 128-bit input polynomials to get a 256-bit
+# intermediate product polynomial, and (b) reduce the intermediate product to
+# 128 bits by adding multiples of G that cancel out terms in it.  (Adding
+# multiples of G doesn't change which field element the polynomial represents.)
+#
+# Unfortunately, the GCM specification maps bits to/from polynomial
+# coefficients backwards from the natural order.  In each byte it specifies the
+# highest bit to be the lowest order polynomial coefficient, *not* the highest!
+# This makes it nontrivial to work with the GHASH polynomials.  We could
+# reflect the bits, but x86 doesn't have an instruction that does that.
+#
+# Instead, we operate on the values without bit-reflecting them.  This *mostly*
+# just works, since XOR and carryless multiplication are symmetric with respect
+# to bit order, but it has some consequences.  First, due to GHASH's byte
+# order, by skipping bit reflection, *byte* reflection becomes necessary to
+# give the polynomial terms a consistent order.  E.g., considering an N-bit
+# value interpreted using the G = x^128 + x^7 + x^2 + x + 1 convention, bits 0
+# through N-1 of the byte-reflected value represent the coefficients of x^(N-1)
+# through x^0, whereas bits 0 through N-1 of the non-byte-reflected value
+# represent x^7...x^0, x^15...x^8, ..., x^(N-1)...x^(N-8) which can't be worked
+# with.  Fortunately, x86's vpshufb instruction can do byte reflection.
+#
+# Second, forgoing the bit reflection causes an extra multiple of x (still
+# using the G = x^128 + x^7 + x^2 + x + 1 convention) to be introduced by each
+# multiplication.  This is because an M-bit by N-bit carryless multiplication
+# really produces a (M+N-1)-bit product, but in practice it's zero-extended to
+# M+N bits.  In the G = x^128 + x^7 + x^2 + x + 1 convention, which maps bits
+# to polynomial coefficients backwards, this zero-extension actually changes
+# the product by introducing an extra factor of x.  Therefore, users of this
+# macro must ensure that one of the inputs has an extra factor of x^-1, i.e.
+# the multiplicative inverse of x, to cancel out the extra x.
+#
+# Third, the backwards coefficients convention is just confusing to work with,
+# since it makes "low" and "high" in the polynomial math mean the opposite of
+# their normal meaning in computer programming.  This can be solved by using an
+# alternative interpretation: the polynomial coefficients are understood to be
+# in the natural order, and the multiplication is actually \a * \b * x^-128 mod
+# x^128 + x^127 + x^126 + x^121 + 1.  This doesn't change the inputs, outputs,
+# or the implementation at all; it just changes the mathematical interpretation
+# of what each instruction is doing.  Starting from here, we'll use this
+# alternative interpretation, as it's easier to understand the code that way.
+#
+# Moving onto the implementation, the vpclmulqdq instruction does 64 x 64 =>
+# 128-bit carryless multiplication, so we break the 128 x 128 multiplication
+# into parts as follows (the _L and _H suffixes denote low and high 64 bits):
+#
+#     LO = a_L * b_L
+#     MI = (a_L * b_H) + (a_H * b_L)
+#     HI = a_H * b_H
+#
+# The 256-bit product is x^128*HI + x^64*MI + LO.  LO, MI, and HI are 128-bit.
+# Note that MI "overlaps" with LO and HI.  We don't consolidate MI into LO and
+# HI right away, since the way the reduction works makes that unnecessary.
+#
+# For the reduction, we cancel out the low 128 bits by adding multiples of G =
+# x^128 + x^127 + x^126 + x^121 + 1.  This is done by two iterations, each of
+# which cancels out the next lowest 64 bits.  Consider a value x^64*A + B,
+# where A and B are 128-bit.  Adding B_L*G to that value gives:
+#
+#       x^64*A + B + B_L*G
+#     = x^64*A + x^64*B_H + B_L + B_L*(x^128 + x^127 + x^126 + x^121 + 1)
+#     = x^64*A + x^64*B_H + B_L + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L
+#     = x^64*A + x^64*B_H + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L + B_L
+#     = x^64*(A + B_H + x^64*B_L + B_L*(x^63 + x^62 + x^57))
+#
+# So: if we sum A, B with its halves swapped, and the low half of B times x^63
+# + x^62 + x^57, we get a 128-bit value C where x^64*C is congruent to the
+# original value x^64*A + B.  I.e., the low 64 bits got canceled out.
+#
+# We just need to apply this twice: first to fold LO into MI, and second to
+# fold the updated MI into HI.
+#
+# The needed three-argument XORs are done using the vpternlogd instruction with
+# immediate 0x96, since this is faster than two vpxord instructions.
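+# (The immediate 0x96 is the 8-bit truth table of a three-input XOR: bit i of
+# 0x96 is the parity of the bits of the index i.)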
+#
+# A potential optimization, assuming that b is fixed per-key (if a is fixed
+# per-key it would work the other way around), is to use one iteration of the
+# reduction described above to precompute a value c such that x^64*c = b mod G,
+# and then multiply a_L by c (and implicitly by x^64) instead of by b:
+#
+#     MI = (a_L * c_L) + (a_H * b_L)
+#     HI = (a_L * c_H) + (a_H * b_H)
+#
+# This would eliminate the LO part of the intermediate product, which would
+# eliminate the need to fold LO into MI.  This would save two instructions,
+# including a vpclmulqdq.  However, we currently don't use this optimization
+# because it would require twice as many per-key precomputed values.
+#
+# Using Karatsuba multiplication instead of "schoolbook" multiplication
+# similarly would save a vpclmulqdq but does not seem to be worth it.
+sub _ghash_mul_step {
+    my ( $i, $a, $b, $dst, $gfpoly, $t0, $t1, $t2 ) = @_;
+    if ( $i == 0 ) {
+        return "vpclmulqdq \$0x00, $a, $b, $t0\n" .    # LO = a_L * b_L
+          "vpclmulqdq \$0x01, $a, $b, $t1\n";          # MI_0 = a_L * b_H
+    }
+    elsif ( $i == 1 ) {
+        return "vpclmulqdq \$0x10, $a, $b, $t2\n";     # MI_1 = a_H * b_L
+    }
+    elsif ( $i == 2 ) {
+        return "vpxord $t2, $t1, $t1\n";               # MI = MI_0 + MI_1
+    }
+    elsif ( $i == 3 ) {
+        return
+          "vpclmulqdq \$0x01, $t0, $gfpoly, $t2\n";  # LO_L*(x^63 + x^62 + x^57)
+    }
+    elsif ( $i == 4 ) {
+        return "vpshufd \$0x4e, $t0, $t0\n";         # Swap halves of LO
+    }
+    elsif ( $i == 5 ) {
+        return "vpternlogd \$0x96, $t2, $t0, $t1\n";    # Fold LO into MI
+    }
+    elsif ( $i == 6 ) {
+        return "vpclmulqdq \$0x11, $a, $b, $dst\n";     # HI = a_H * b_H
+    }
+    elsif ( $i == 7 ) {
+        return
+          "vpclmulqdq \$0x01, $t1, $gfpoly, $t0\n";  # MI_L*(x^63 + x^62 + x^57)
+    }
+    elsif ( $i == 8 ) {
+        return "vpshufd \$0x4e, $t1, $t1\n";         # Swap halves of MI
+    }
+    elsif ( $i == 9 ) {
+        return "vpternlogd \$0x96, $t0, $t1, $dst\n";    # Fold MI into HI
+    }
+}
+
+# GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
+# the reduced products in \dst.  See _ghash_mul_step for full explanation.
+sub _ghash_mul {
+    my ( $a, $b, $dst, $gfpoly, $t0, $t1, $t2 ) = @_;
+    my $code = "";
+    for my $i ( 0 .. 9 ) {
+        $code .= _ghash_mul_step $i, $a, $b, $dst, $gfpoly, $t0, $t1, $t2;
+    }
+    return $code;
+}
+
+# GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the
+# *unreduced* products to \lo, \mi, and \hi.
+sub _ghash_mul_noreduce {
+    my ( $a, $b, $lo, $mi, $hi, $t0, $t1, $t2, $t3 ) = @_;
+    return <<___;
+    vpclmulqdq      \$0x00, $a, $b, $t0      # a_L * b_L
+    vpclmulqdq      \$0x01, $a, $b, $t1      # a_L * b_H
+    vpclmulqdq      \$0x10, $a, $b, $t2      # a_H * b_L
+    vpclmulqdq      \$0x11, $a, $b, $t3      # a_H * b_H
+    vpxord          $t0, $lo, $lo
+    vpternlogd      \$0x96, $t2, $t1, $mi
+    vpxord          $t3, $hi, $hi
+___
+}
+
+# Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit
+# reduced products in \hi.  See _ghash_mul_step for explanation of reduction.
+sub _ghash_reduce {
+    my ( $lo, $mi, $hi, $gfpoly, $t0 ) = @_;
+    return <<___;
+    vpclmulqdq      \$0x01, $lo, $gfpoly, $t0
+    vpshufd         \$0x4e, $lo, $lo
+    vpternlogd      \$0x96, $t0, $lo, $mi
+    vpclmulqdq      \$0x01, $mi, $gfpoly, $t0
+    vpshufd         \$0x4e, $mi, $mi
+    vpternlogd      \$0x96, $t0, $mi, $hi
+___
+}
+
+$g_init_macro_expansion_count = 0;
+
+# void gcm_init_##suffix(u128 Htable[16], const uint64_t H[2]);
+#
+# Initialize |Htable| with powers of the GHASH subkey |H|.
+#
+# The powers are stored in the order H^NUM_H_POWERS to H^1.
+#
+# This macro supports both VL=32 and VL=64.  _set_veclen must have been invoked
+# with the desired length.  In the VL=32 case, the function computes twice as
+# many key powers as are actually used by the VL=32 GCM update functions.
+# This is done to keep the key format the same regardless of vector length.
+sub _aes_gcm_init {
+    my $local_label_suffix = "__func" . ++$g_init_macro_expansion_count;
+
+    # Function arguments
+    my ( $HTABLE, $H_PTR ) = @argregs[ 0 .. 1 ];
+
+    # Additional local variables.  V0-V2 and %rax are used as temporaries.
+    my $POWERS_PTR     = "%r8";
+    my $RNDKEYLAST_PTR = "%r9";
+    my ( $H_CUR, $H_CUR_YMM, $H_CUR_XMM )    = ( "$V3", "%ymm3", "%xmm3" );
+    my ( $H_INC, $H_INC_YMM, $H_INC_XMM )    = ( "$V4", "%ymm4", "%xmm4" );
+    my ( $GFPOLY, $GFPOLY_YMM, $GFPOLY_XMM ) = ( "$V5", "%ymm5", "%xmm5" );
+
+    my $code = <<___;
+    # Get pointer to lowest set of key powers (located at end of array).
+    lea             $OFFSETOFEND_H_POWERS-$VL($HTABLE), $POWERS_PTR
+
+    # Load the byte-reflected hash subkey.  BoringSSL provides it in
+    # byte-reflected form except the two halves are in the wrong order.
+    vpshufd         \$0x4e, ($H_PTR), $H_CUR_XMM
+
+    # Finish preprocessing the first key power, H^1.  Since this GHASH
+    # implementation operates directly on values with the backwards bit
+    # order specified by the GCM standard, it's necessary to preprocess the
+    # raw key as follows.  First, reflect its bytes.  Second, multiply it
+    # by x^-1 mod x^128 + x^7 + x^2 + x + 1 (if using the backwards
+    # interpretation of polynomial coefficients), which can also be
+    # interpreted as multiplication by x mod x^128 + x^127 + x^126 + x^121
+    # + 1 using the alternative, natural interpretation of polynomial
+    # coefficients.  For details, see the comment above _ghash_mul_step.
+    #
+    # Either way, for the multiplication the concrete operation performed
+    # is a left shift of the 128-bit value by 1 bit, then an XOR with (0xc2
+    # << 120) | 1 if a 1 bit was carried out.  However, there's no 128-bit
+    # wide shift instruction, so instead double each of the two 64-bit
+    # halves and incorporate the internal carry bit into the value XOR'd.
+    vpshufd         \$0xd3, $H_CUR_XMM, %xmm0
+    vpsrad          \$31, %xmm0, %xmm0
+    vpaddq          $H_CUR_XMM, $H_CUR_XMM, $H_CUR_XMM
+    # H_CUR_XMM ^= xmm0 & gfpoly_and_internal_carrybit
+    vpternlogd      \$0x78, .Lgfpoly_and_internal_carrybit(%rip), %xmm0, $H_CUR_XMM
+
+    # Load the gfpoly constant.
+    vbroadcasti32x4 .Lgfpoly(%rip), $GFPOLY
+
+    # Square H^1 to get H^2.
+    #
+    # Note that as with H^1, all higher key powers also need an extra
+    # factor of x^-1 (or x using the natural interpretation).  Nothing
+    # special needs to be done to make this happen, though: H^1 * H^1 would
+    # end up with two factors of x^-1, but the multiplication consumes one.
+    # So the product H^2 ends up with the desired one factor of x^-1.
+    @{[ _ghash_mul  $H_CUR_XMM, $H_CUR_XMM, $H_INC_XMM, $GFPOLY_XMM,
+                    "%xmm0", "%xmm1", "%xmm2" ]}
+
+    # Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2].
+    vinserti128     \$1, $H_CUR_XMM, $H_INC_YMM, $H_CUR_YMM
+    vinserti128     \$1, $H_INC_XMM, $H_INC_YMM, $H_INC_YMM
+___
+
+    if ( $VL == 64 ) {
+
+        # Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4].
+        $code .= <<___;
+        @{[ _ghash_mul  $H_INC_YMM, $H_CUR_YMM, $H_INC_YMM, $GFPOLY_YMM,
+                        "%ymm0", "%ymm1", "%ymm2" ]}
+        vinserti64x4    \$1, $H_CUR_YMM, $H_INC, $H_CUR
+        vshufi64x2      \$0, $H_INC, $H_INC, $H_INC
+___
+    }
+
+    $code .= <<___;
+    # Store the lowest set of key powers.
+    vmovdqu8        $H_CUR, ($POWERS_PTR)
+
+    # Compute and store the remaining key powers.  With VL=32, repeatedly
+    # multiply [H^(i+1), H^i] by [H^2, H^2] to get [H^(i+3), H^(i+2)].
+    # With VL=64, repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by
+    # [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)].
+    mov             \$@{[ $NUM_H_POWERS*16/$VL - 1 ]}, %eax
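+    # (That count is NUM_H_POWERS*16/VL - 1: 7 more vectors when VL=32, or 3
+    # more vectors when VL=64, since the lowest set was already stored above.)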
+.Lprecompute_next$local_label_suffix:
+    sub             \$$VL, $POWERS_PTR
+    @{[ _ghash_mul  $H_INC, $H_CUR, $H_CUR, $GFPOLY, $V0, $V1, $V2 ]}
+    vmovdqu8        $H_CUR, ($POWERS_PTR)
+    dec             %eax
+    jnz             .Lprecompute_next$local_label_suffix
+
+    vzeroupper      # This is needed after using ymm or zmm registers.
+___
+    return $code;
+}
+
+# XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store
+# the result in \dst_xmm.  This implicitly zeroizes the other lanes of dst.
+sub _horizontal_xor {
+    my ( $src, $src_xmm, $dst_xmm, $t0_xmm, $t1_xmm, $t2_xmm ) = @_;
+    if ( $VL == 32 ) {
+        return <<___;
+        vextracti32x4   \$1, $src, $t0_xmm
+        vpxord          $t0_xmm, $src_xmm, $dst_xmm
+___
+    }
+    elsif ( $VL == 64 ) {
+        return <<___;
+        vextracti32x4   \$1, $src, $t0_xmm
+        vextracti32x4   \$2, $src, $t1_xmm
+        vextracti32x4   \$3, $src, $t2_xmm
+        vpxord          $t0_xmm, $src_xmm, $dst_xmm
+        vpternlogd      \$0x96, $t1_xmm, $t2_xmm, $dst_xmm
+___
+    }
+    else {
+        die "Unsupported vector length";
+    }
+}
+
+# Do one step of the GHASH update of the data blocks given in the vector
+# registers GHASHDATA[0-3].  \i specifies the step to do, 0 through 9.  The
+# division into steps allows users of this macro to optionally interleave the
+# computation with other instructions.  This macro uses the vector register
+# GHASH_ACC as input/output; GHASHDATA[0-3] as inputs that are clobbered;
+# H_POW[4-1], GFPOLY, and BSWAP_MASK as inputs that aren't clobbered; and
+# GHASHTMP[0-2] as temporaries.  This macro handles the byte-reflection of the
+# data blocks.  The parameter registers must be preserved across steps.
+#
+# The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) +
+# H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the
+# operations are vectorized operations on vectors of 16-byte blocks.  E.g.,
+# with VL=32 there are 2 blocks per vector and the vectorized terms correspond
+# to the following non-vectorized terms:
+#
+#       H_POW4*(GHASHDATA0 + GHASH_ACC) => H^8*(blk0 + GHASH_ACC_XMM) and H^7*(blk1 + 0)
+#       H_POW3*GHASHDATA1 => H^6*blk2 and H^5*blk3
+#       H_POW2*GHASHDATA2 => H^4*blk4 and H^3*blk5
+#       H_POW1*GHASHDATA3 => H^2*blk6 and H^1*blk7
+#
+# With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15.
+#
+# More concretely, this code does:
+#   - Do vectorized "schoolbook" multiplications to compute the intermediate
+#     256-bit product of each block and its corresponding hash key power.
+#     There are 4*VL/16 of these intermediate products.
+#   - Sum (XOR) the intermediate 256-bit products across vectors.  This leaves
+#     VL/16 256-bit intermediate values.
+#   - Do a vectorized reduction of these 256-bit intermediate values to
+#     128-bits each.  This leaves VL/16 128-bit intermediate values.
+#   - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM.
+#
+# See _ghash_mul_step for the full explanation of the operations performed for
+# each individual finite field multiplication and reduction.
+sub _ghash_step_4x {
+    my ($i) = @_;
+    if ( $i == 0 ) {
+        return <<___;
+        vpshufb         $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0
+        vpxord          $GHASH_ACC, $GHASHDATA0, $GHASHDATA0
+        vpshufb         $BSWAP_MASK, $GHASHDATA1, $GHASHDATA1
+        vpshufb         $BSWAP_MASK, $GHASHDATA2, $GHASHDATA2
+___
+    }
+    elsif ( $i == 1 ) {
+        return <<___;
+        vpshufb         $BSWAP_MASK, $GHASHDATA3, $GHASHDATA3
+        vpclmulqdq      \$0x00, $H_POW4, $GHASHDATA0, $GHASH_ACC    # LO_0
+        vpclmulqdq      \$0x00, $H_POW3, $GHASHDATA1, $GHASHTMP0    # LO_1
+        vpclmulqdq      \$0x00, $H_POW2, $GHASHDATA2, $GHASHTMP1    # LO_2
+___
+    }
+    elsif ( $i == 2 ) {
+        return <<___;
+        vpxord          $GHASHTMP0, $GHASH_ACC, $GHASH_ACC          # sum(LO_{1,0})
+        vpclmulqdq      \$0x00, $H_POW1, $GHASHDATA3, $GHASHTMP2    # LO_3
+        vpternlogd      \$0x96, $GHASHTMP2, $GHASHTMP1, $GHASH_ACC  # LO = sum(LO_{3,2,1,0})
+        vpclmulqdq      \$0x01, $H_POW4, $GHASHDATA0, $GHASHTMP0    # MI_0
+___
+    }
+    elsif ( $i == 3 ) {
+        return <<___;
+        vpclmulqdq      \$0x01, $H_POW3, $GHASHDATA1, $GHASHTMP1    # MI_1
+        vpclmulqdq      \$0x01, $H_POW2, $GHASHDATA2, $GHASHTMP2    # MI_2
+        vpternlogd      \$0x96, $GHASHTMP2, $GHASHTMP1, $GHASHTMP0  # sum(MI_{2,1,0})
+        vpclmulqdq      \$0x01, $H_POW1, $GHASHDATA3, $GHASHTMP1    # MI_3
+___
+    }
+    elsif ( $i == 4 ) {
+        return <<___;
+        vpclmulqdq      \$0x10, $H_POW4, $GHASHDATA0, $GHASHTMP2    # MI_4
+        vpternlogd      \$0x96, $GHASHTMP2, $GHASHTMP1, $GHASHTMP0  # sum(MI_{4,3,2,1,0})
+        vpclmulqdq      \$0x10, $H_POW3, $GHASHDATA1, $GHASHTMP1    # MI_5
+        vpclmulqdq      \$0x10, $H_POW2, $GHASHDATA2, $GHASHTMP2    # MI_6
+___
+    }
+    elsif ( $i == 5 ) {
+        return <<___;
+        vpternlogd      \$0x96, $GHASHTMP2, $GHASHTMP1, $GHASHTMP0  # sum(MI_{6,5,4,3,2,1,0})
+        vpclmulqdq      \$0x01, $GHASH_ACC, $GFPOLY, $GHASHTMP2     # LO_L*(x^63 + x^62 + x^57)
+        vpclmulqdq      \$0x10, $H_POW1, $GHASHDATA3, $GHASHTMP1    # MI_7
+        vpxord          $GHASHTMP1, $GHASHTMP0, $GHASHTMP0          # MI = sum(MI_{7,6,5,4,3,2,1,0})
+___
+    }
+    elsif ( $i == 6 ) {
+        return <<___;
+        vpshufd         \$0x4e, $GHASH_ACC, $GHASH_ACC              # Swap halves of LO
+        vpclmulqdq      \$0x11, $H_POW4, $GHASHDATA0, $GHASHDATA0   # HI_0
+        vpclmulqdq      \$0x11, $H_POW3, $GHASHDATA1, $GHASHDATA1   # HI_1
+        vpclmulqdq      \$0x11, $H_POW2, $GHASHDATA2, $GHASHDATA2   # HI_2
+___
+    }
+    elsif ( $i == 7 ) {
+        return <<___;
+        vpternlogd      \$0x96, $GHASHTMP2, $GHASH_ACC, $GHASHTMP0  # Fold LO into MI
+        vpclmulqdq      \$0x11, $H_POW1, $GHASHDATA3, $GHASHDATA3   # HI_3
+        vpternlogd      \$0x96, $GHASHDATA2, $GHASHDATA1, $GHASHDATA0 # sum(HI_{2,1,0})
+        vpclmulqdq      \$0x01, $GHASHTMP0, $GFPOLY, $GHASHTMP1     # MI_L*(x^63 + x^62 + x^57)
+___
+    }
+    elsif ( $i == 8 ) {
+        return <<___;
+        vpxord          $GHASHDATA3, $GHASHDATA0, $GHASH_ACC        # HI = sum(HI_{3,2,1,0})
+        vpshufd         \$0x4e, $GHASHTMP0, $GHASHTMP0              # Swap halves of MI
+        vpternlogd      \$0x96, $GHASHTMP1, $GHASHTMP0, $GHASH_ACC  # Fold MI into HI
+___
+    }
+    elsif ( $i == 9 ) {
+        return _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM,
+          $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM;
+    }
+}
+
+# Update GHASH with the blocks given in GHASHDATA[0-3].
+# See _ghash_step_4x for full explanation.
+sub _ghash_4x {
+    my $code = "";
+    for my $i ( 0 .. 9 ) {
+        $code .= _ghash_step_4x $i;
+    }
+    return $code;
+}
+
+$g_ghash_macro_expansion_count = 0;
+
+# void gcm_ghash_##suffix(uint8_t Xi[16], const u128 Htable[16],
+#                         const uint8_t *in, size_t len);
+#
+# This macro generates the body of a GHASH update function with the above
+# prototype.  This macro supports both VL=32 and VL=64.  _set_veclen must have
+# been invoked with the desired length.
+#
+# The generated function processes the AAD (Additional Authenticated Data) in
+# GCM.  Using the key |Htable|, it updates the GHASH accumulator |Xi| with the
+# data given by |in| and |len|.  On the first call, |Xi| must be all zeroes.
+# |len| must be a multiple of 16.
+#
+# This function handles large amounts of AAD efficiently, while also keeping the
+# overhead low for small amounts of AAD, which is the common case.  TLS uses
+# less than one block of AAD, but other use cases may (uncommonly) use much more.
+sub _ghash_update {
+    my $local_label_suffix = "__func" . ++$g_ghash_macro_expansion_count;
+    my $code               = "";
+
+    # Function arguments
+    my ( $GHASH_ACC_PTR, $H_POWERS, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ];
+
+    # Additional local variables
+    ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( $V0, "%xmm0" );
+    ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( $V1, "%xmm1" );
+    ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( $V2, "%xmm2" );
+    ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( $V3, "%xmm3" );
+    ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( $V4, "%xmm4" );
+    ( $GHASH_ACC,  $GHASH_ACC_XMM )  = ( $V5, "%xmm5" );
+    ( $H_POW4, $H_POW3, $H_POW2 )          = ( $V6, $V7, $V8 );
+    ( $H_POW1, $H_POW1_XMM )               = ( $V9, "%xmm9" );
+    ( $GFPOLY, $GFPOLY_XMM )               = ( $V10, "%xmm10" );
+    ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) = ( $V11, $V12, $V13 );
+
+    $code .= <<___;
+    @{[ _save_xmmregs (6 .. 13) ]}
+    .seh_endprologue
+
+    # Load the bswap_mask and gfpoly constants.  Since AADLEN is usually small,
+    # only 128-bit vectors will typically be used.  So as an optimization, don't
+    # broadcast these constants to all 128-bit lanes quite yet.
+    vmovdqu         .Lbswap_mask(%rip), $BSWAP_MASK_XMM
+    vmovdqu         .Lgfpoly(%rip), $GFPOLY_XMM
+
+    # Load the GHASH accumulator.
+    vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC_XMM
+    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+
+    # Optimize for AADLEN < VL by checking for AADLEN < VL before AADLEN < 4*VL.
+    cmp             \$$VL, $AADLEN
+    jb              .Laad_blockbyblock$local_label_suffix
+
+    # AADLEN >= VL, so we'll operate on full vectors.  Broadcast bswap_mask and
+    # gfpoly to all 128-bit lanes.
+    vshufi64x2      \$0, $BSWAP_MASK, $BSWAP_MASK, $BSWAP_MASK
+    vshufi64x2      \$0, $GFPOLY, $GFPOLY, $GFPOLY
+
+    # Load the lowest set of key powers.
+    vmovdqu8        $OFFSETOFEND_H_POWERS-1*$VL($H_POWERS), $H_POW1
+
+    cmp             \$4*$VL-1, $AADLEN
+    jbe             .Laad_loop_1x$local_label_suffix
+
+    # AADLEN >= 4*VL.  Load the higher key powers.
+    vmovdqu8        $OFFSETOFEND_H_POWERS-4*$VL($H_POWERS), $H_POW4
+    vmovdqu8        $OFFSETOFEND_H_POWERS-3*$VL($H_POWERS), $H_POW3
+    vmovdqu8        $OFFSETOFEND_H_POWERS-2*$VL($H_POWERS), $H_POW2
+
+    # Update GHASH with 4*VL bytes of AAD at a time.
+.Laad_loop_4x$local_label_suffix:
+    vmovdqu8        0*$VL($AAD), $GHASHDATA0
+    vmovdqu8        1*$VL($AAD), $GHASHDATA1
+    vmovdqu8        2*$VL($AAD), $GHASHDATA2
+    vmovdqu8        3*$VL($AAD), $GHASHDATA3
+    @{[ _ghash_4x ]}
+    sub             \$-4*$VL, $AAD  # shorter than 'add 4*VL' when VL=32
+    add             \$-4*$VL, $AADLEN
+    cmp             \$4*$VL-1, $AADLEN
+    ja              .Laad_loop_4x$local_label_suffix
+
+    # Update GHASH with VL bytes of AAD at a time.
+    cmp             \$$VL, $AADLEN
+    jb              .Laad_large_done$local_label_suffix
+.Laad_loop_1x$local_label_suffix:
+    vmovdqu8        ($AAD), $GHASHDATA0
+    vpshufb         $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0
+    vpxord          $GHASHDATA0, $GHASH_ACC, $GHASH_ACC
+    @{[ _ghash_mul  $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY,
+                    $GHASHDATA0, $GHASHDATA1, $GHASHDATA2 ]}
+    @{[ _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM,
+                        $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
+    add             \$$VL, $AAD
+    sub             \$$VL, $AADLEN
+    cmp             \$$VL, $AADLEN
+    jae             .Laad_loop_1x$local_label_suffix
+
+.Laad_large_done$local_label_suffix:
+    # Issue the vzeroupper that is needed after using ymm or zmm registers.
+    # Do it here instead of at the end, to minimize overhead for small AADLEN.
+    vzeroupper
+
+    # GHASH the remaining data 16 bytes at a time, using xmm registers only.
+.Laad_blockbyblock$local_label_suffix:
+    test            $AADLEN, $AADLEN
+    jz              .Laad_done$local_label_suffix
+    vmovdqu         $OFFSETOFEND_H_POWERS-16($H_POWERS), $H_POW1_XMM
+.Laad_loop_blockbyblock$local_label_suffix:
+    vmovdqu         ($AAD), $GHASHDATA0_XMM
+    vpshufb         $BSWAP_MASK_XMM, $GHASHDATA0_XMM, $GHASHDATA0_XMM
+    vpxor           $GHASHDATA0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+    @{[ _ghash_mul  $H_POW1_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM,
+                    $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
+    add             \$16, $AAD
+    sub             \$16, $AADLEN
+    jnz             .Laad_loop_blockbyblock$local_label_suffix
+
+.Laad_done$local_label_suffix:
+    # Store the updated GHASH accumulator back to memory.
+    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+    vmovdqu         $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
+___
+    return $code;
+}
+
+# Do one non-last round of AES encryption on the counter blocks in V0-V3 using
+# the round key that has been broadcast to all 128-bit lanes of \round_key.
+sub _vaesenc_4x {
+    my ($round_key) = @_;
+    return <<___;
+    vaesenc         $round_key, $V0, $V0
+    vaesenc         $round_key, $V1, $V1
+    vaesenc         $round_key, $V2, $V2
+    vaesenc         $round_key, $V3, $V3
+___
+}
+
+# Start the AES encryption of four vectors of counter blocks.
+sub _ctr_begin_4x {
+    return <<___;
+    # Increment LE_CTR four times to generate four vectors of little-endian
+    # counter blocks, swap each to big-endian, and store them in V0-V3.
+    vpshufb         $BSWAP_MASK, $LE_CTR, $V0
+    vpaddd          $LE_CTR_INC, $LE_CTR, $LE_CTR
+    vpshufb         $BSWAP_MASK, $LE_CTR, $V1
+    vpaddd          $LE_CTR_INC, $LE_CTR, $LE_CTR
+    vpshufb         $BSWAP_MASK, $LE_CTR, $V2
+    vpaddd          $LE_CTR_INC, $LE_CTR, $LE_CTR
+    vpshufb         $BSWAP_MASK, $LE_CTR, $V3
+    vpaddd          $LE_CTR_INC, $LE_CTR, $LE_CTR
+
+    # AES "round zero": XOR in the zero-th round key.
+    vpxord          $RNDKEY0, $V0, $V0
+    vpxord          $RNDKEY0, $V1, $V1
+    vpxord          $RNDKEY0, $V2, $V2
+    vpxord          $RNDKEY0, $V3, $V3
+___
+}
+
+# Do the last AES round for the four vectors of counter blocks V0-V3, XOR the
+# source data with the resulting keystream, and write the result to DST and
+# GHASHDATA[0-3].  (The implementation differs slightly from this description
+# but has the same effect.)
+sub _aesenclast_and_xor_4x {
+    return <<___;
+    # XOR the source data with the last round key, saving the result in
+    # GHASHDATA[0-3].  This reduces latency by taking advantage of the
+    # property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
+    vpxord          0*$VL($SRC), $RNDKEYLAST, $GHASHDATA0
+    vpxord          1*$VL($SRC), $RNDKEYLAST, $GHASHDATA1
+    vpxord          2*$VL($SRC), $RNDKEYLAST, $GHASHDATA2
+    vpxord          3*$VL($SRC), $RNDKEYLAST, $GHASHDATA3
+
+    # Do the last AES round.  This handles the XOR with the source data
+    # too, as per the optimization described above.
+    vaesenclast     $GHASHDATA0, $V0, $GHASHDATA0
+    vaesenclast     $GHASHDATA1, $V1, $GHASHDATA1
+    vaesenclast     $GHASHDATA2, $V2, $GHASHDATA2
+    vaesenclast     $GHASHDATA3, $V3, $GHASHDATA3
+
+    # Store the en/decrypted data to DST.
+    vmovdqu8        $GHASHDATA0, 0*$VL($DST)
+    vmovdqu8        $GHASHDATA1, 1*$VL($DST)
+    vmovdqu8        $GHASHDATA2, 2*$VL($DST)
+    vmovdqu8        $GHASHDATA3, 3*$VL($DST)
+___
+}
+
+$g_update_macro_expansion_count = 0;
+
+# void aes_gcm_{enc,dec}_update_##suffix(const uint8_t *in, uint8_t *out,
+#                                        size_t len, const AES_KEY *key,
+#                                        const uint8_t ivec[16],
+#                                        const u128 Htable[16],
+#                                        uint8_t Xi[16]);
+#
+# This macro generates a GCM encryption or decryption update function with the
+# above prototype (with \enc selecting which one).  This macro supports both
+# VL=32 and VL=64.  _set_veclen must have been invoked with the desired length.
+#
+# This function computes the next portion of the CTR keystream, XOR's it with
+# |len| bytes from |in|, and writes the resulting encrypted or decrypted data
+# to |out|.  It also updates the GHASH accumulator |Xi| using the next |len|
+# ciphertext bytes.
+#
+# |len| must be a multiple of 16, except on the last call where it can be any
+# length.  The caller must do any buffering needed to ensure this.  Both
+# in-place and out-of-place en/decryption are supported.
+#
+# |ivec| must give the current counter in big-endian format.  This function
+# loads the counter from |ivec| and increments the loaded counter as needed, but
+# it does *not* store the updated counter back to |ivec|.  The caller must
+# update |ivec| if any more data segments follow.  Internally, only the low
+# 32-bit word of the counter is incremented, following the GCM standard.
+sub _aes_gcm_update {
+    my $local_label_suffix = "__func" . ++$g_update_macro_expansion_count;
+
+    my ($enc) = @_;
+
+    my $code = "";
+
+    # Function arguments
+    ( $SRC, $DST, $DATALEN, $AESKEY, $BE_CTR_PTR, $H_POWERS, $GHASH_ACC_PTR ) =
+      $win64
+      ? ( @argregs[ 0 .. 3 ], "%rsi", "%rdi", "%r12" )
+      : ( @argregs[ 0 .. 5 ], "%r12" );
+
+    # Additional local variables
+
+    # %rax, %k1, and %k2 are used as temporary registers.  BE_CTR_PTR is
+    # also available as a temporary register after the counter is loaded.
+
+    # AES key length in bytes
+    ( $AESKEYLEN, $AESKEYLEN64 ) = ( "%r10d", "%r10" );
+
+    # Pointer to the last AES round key for the chosen AES variant
+    $RNDKEYLAST_PTR = "%r11";
+
+    # In the main loop, V0-V3 are used as AES input and output.  Elsewhere
+    # they are used as temporary registers.
+
+    # GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data.
+    ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( $V4, "%xmm4" );
+    ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( $V5, "%xmm5" );
+    ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( $V6, "%xmm6" );
+    ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( $V7, "%xmm7" );
+
+    # BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
+    # using vpshufb, copied to all 128-bit lanes.
+    ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( $V8, "%xmm8" );
+
+    # RNDKEY temporarily holds the next AES round key.
+    $RNDKEY = $V9;
+
+    # GHASH_ACC is the accumulator variable for GHASH.  When fully reduced,
+    # only the lowest 128-bit lane can be nonzero.  When not fully reduced,
+    # more than one lane may be used, and they need to be XOR'd together.
+    ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( $V10, "%xmm10" );
+
+    # LE_CTR_INC is the vector of 32-bit words that need to be added to a
+    # vector of little-endian counter blocks to advance it forwards.
+    $LE_CTR_INC = $V11;
+
+    # LE_CTR contains the next set of little-endian counter blocks.
+    $LE_CTR = $V12;
+
+    # RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys,
+    # copied to all 128-bit lanes.  RNDKEY0 is the zero-th round key,
+    # RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
+    (
+        $RNDKEY0,   $RNDKEYLAST, $RNDKEY_M9, $RNDKEY_M8,
+        $RNDKEY_M7, $RNDKEY_M6,  $RNDKEY_M5, $RNDKEY_M4,
+        $RNDKEY_M3, $RNDKEY_M2,  $RNDKEY_M1
+    ) = ( $V13, $V14, $V15, $V16, $V17, $V18, $V19, $V20, $V21, $V22, $V23 );
+
+    # GHASHTMP[0-2] are temporary variables used by _ghash_step_4x.  These
+    # cannot coincide with anything used for AES encryption, since for
+    # performance reasons GHASH and AES encryption are interleaved.
+    ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) = ( $V24, $V25, $V26 );
+
+    # H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1.  The
+    # descending numbering reflects the order of the key powers.
+    ( $H_POW4, $H_POW3, $H_POW2, $H_POW1 ) = ( $V27, $V28, $V29, $V30 );
+
+    # GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes.
+    $GFPOLY = $V31;
+
+    if ($win64) {
+        $code .= <<___;
+        @{[ _save_gpregs $BE_CTR_PTR, $H_POWERS, $GHASH_ACC_PTR ]}
+        mov             64(%rsp), $BE_CTR_PTR     # arg5
+        mov             72(%rsp), $H_POWERS       # arg6
+        mov             80(%rsp), $GHASH_ACC_PTR  # arg7
+        @{[ _save_xmmregs (6 .. 15) ]}
+        .seh_endprologue
+___
+    }
+    else {
+        $code .= <<___;
+        @{[ _save_gpregs $GHASH_ACC_PTR ]}
+        mov             16(%rsp), $GHASH_ACC_PTR  # arg7
+___
+    }
+
+    if ($enc) {
+        $code .= <<___;
+#ifdef BORINGSSL_DISPATCH_TEST
+        .extern BORINGSSL_function_hit
+        movb \$1,BORINGSSL_function_hit+@{[ $VL < 64 ? 6 : 7 ]}(%rip)
+#endif
+___
+    }
+    $code .= <<___;
+    # Load some constants.
+    vbroadcasti32x4 .Lbswap_mask(%rip), $BSWAP_MASK
+    vbroadcasti32x4 .Lgfpoly(%rip), $GFPOLY
+
+    # Load the GHASH accumulator and the starting counter.
+    # BoringSSL passes these values in big endian format.
+    vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC_XMM
+    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+    vbroadcasti32x4 ($BE_CTR_PTR), $LE_CTR
+    vpshufb         $BSWAP_MASK, $LE_CTR, $LE_CTR
+
+    # Load the AES key length in bytes.  BoringSSL stores number of rounds
+    # minus 1, so convert using: AESKEYLEN = 4 * aeskey->rounds - 20.
+    movl            $OFFSETOF_AES_ROUNDS($AESKEY), $AESKEYLEN
+    lea             -20(,$AESKEYLEN,4), $AESKEYLEN
+
+    # Make RNDKEYLAST_PTR point to the last AES round key.  This is the
+    # round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256
+    # respectively.  Then load the zero-th and last round keys.
+    lea             6*16($AESKEY,$AESKEYLEN64,4), $RNDKEYLAST_PTR
+    vbroadcasti32x4 ($AESKEY), $RNDKEY0
+    vbroadcasti32x4 ($RNDKEYLAST_PTR), $RNDKEYLAST
+
+    # Finish initializing LE_CTR by adding [0, 1, ...] to its low words.
+    vpaddd          .Lctr_pattern(%rip), $LE_CTR, $LE_CTR
+
+    # Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes.
+    vbroadcasti32x4 .Linc_@{[ $VL / 16 ]}blocks(%rip), $LE_CTR_INC
+
+    # If there are at least 4*VL bytes of data, then continue into the loop
+    # that processes 4*VL bytes of data at a time.  Otherwise skip it.
+    cmp             \$4*$VL-1, $DATALEN
+    jbe             .Lcrypt_loop_4x_done$local_label_suffix
+
+    # Load powers of the hash key.
+    vmovdqu8        $OFFSETOFEND_H_POWERS-4*$VL($H_POWERS), $H_POW4
+    vmovdqu8        $OFFSETOFEND_H_POWERS-3*$VL($H_POWERS), $H_POW3
+    vmovdqu8        $OFFSETOFEND_H_POWERS-2*$VL($H_POWERS), $H_POW2
+    vmovdqu8        $OFFSETOFEND_H_POWERS-1*$VL($H_POWERS), $H_POW1
+___
+
+    # Main loop: en/decrypt and hash 4 vectors at a time.
+    #
+    # When possible, interleave the AES encryption of the counter blocks
+    # with the GHASH update of the ciphertext blocks.  This improves
+    # performance on many CPUs because the execution ports used by the VAES
+    # instructions often differ from those used by vpclmulqdq and other
+    # instructions used in GHASH.  For example, many Intel CPUs dispatch
+    # vaesenc to ports 0 and 1 and vpclmulqdq to port 5.
+    #
+    # The interleaving is easiest to do during decryption, since during
+    # decryption the ciphertext blocks are immediately available.  For
+    # encryption, instead encrypt the first set of blocks, then hash those
+    # blocks while encrypting the next set of blocks, repeat that as
+    # needed, and finally hash the last set of blocks.
+
+    if ($enc) {
+        $code .= <<___;
+        # Encrypt the first 4 vectors of plaintext blocks.  Leave the resulting
+        # ciphertext in GHASHDATA[0-3] for GHASH.
+        @{[ _ctr_begin_4x ]}
+        lea             16($AESKEY), %rax
+.Lvaesenc_loop_first_4_vecs$local_label_suffix:
+        vbroadcasti32x4 (%rax), $RNDKEY
+        @{[ _vaesenc_4x $RNDKEY ]}
+        add             \$16, %rax
+        cmp             %rax, $RNDKEYLAST_PTR
+        jne             .Lvaesenc_loop_first_4_vecs$local_label_suffix
+        @{[ _aesenclast_and_xor_4x ]}
+        sub             \$-4*$VL, $SRC  # shorter than 'add 4*VL' when VL=32
+        sub             \$-4*$VL, $DST
+        add             \$-4*$VL, $DATALEN
+        cmp             \$4*$VL-1, $DATALEN
+        jbe             .Lghash_last_ciphertext_4x$local_label_suffix
+___
+    }
+
+    # Cache as many additional AES round keys as possible.
+    for my $i ( reverse 1 .. 9 ) {
+        $code .= <<___;
+        vbroadcasti32x4 -$i*16($RNDKEYLAST_PTR), ${"RNDKEY_M$i"}
+___
+    }
+
+    $code .= <<___;
+.Lcrypt_loop_4x$local_label_suffix:
+___
+
+    # If decrypting, load more ciphertext blocks into GHASHDATA[0-3].  If
+    # encrypting, GHASHDATA[0-3] already contain the previous ciphertext.
+    if ( !$enc ) {
+        $code .= <<___;
+        vmovdqu8        0*$VL($SRC), $GHASHDATA0
+        vmovdqu8        1*$VL($SRC), $GHASHDATA1
+        vmovdqu8        2*$VL($SRC), $GHASHDATA2
+        vmovdqu8        3*$VL($SRC), $GHASHDATA3
+___
+    }
+
+    $code .= <<___;
+    # Start the AES encryption of the counter blocks.
+    @{[ _ctr_begin_4x ]}
+    cmp             \$24, $AESKEYLEN
+    jl              .Laes128$local_label_suffix
+    je              .Laes192$local_label_suffix
+    # AES-256
+    vbroadcasti32x4 -13*16($RNDKEYLAST_PTR), $RNDKEY
+    @{[ _vaesenc_4x $RNDKEY ]}
+    vbroadcasti32x4 -12*16($RNDKEYLAST_PTR), $RNDKEY
+    @{[ _vaesenc_4x $RNDKEY ]}
+.Laes192$local_label_suffix:
+    vbroadcasti32x4 -11*16($RNDKEYLAST_PTR), $RNDKEY
+    @{[ _vaesenc_4x $RNDKEY ]}
+    vbroadcasti32x4 -10*16($RNDKEYLAST_PTR), $RNDKEY
+    @{[ _vaesenc_4x $RNDKEY ]}
+.Laes128$local_label_suffix:
+___
+
+    # Finish the AES encryption of the counter blocks in V0-V3, interleaved
+    # with the GHASH update of the ciphertext blocks in GHASHDATA[0-3].
+    for my $i ( reverse 1 .. 9 ) {
+        $code .= <<___;
+        @{[ _ghash_step_4x  (9 - $i) ]}
+        @{[ _vaesenc_4x     ${"RNDKEY_M$i"} ]}
+___
+    }
+    $code .= <<___;
+    @{[ _ghash_step_4x  9 ]}
+    @{[ _aesenclast_and_xor_4x ]}
+    sub             \$-4*$VL, $SRC  # shorter than 'add 4*VL' when VL=32
+    sub             \$-4*$VL, $DST
+    add             \$-4*$VL, $DATALEN
+    cmp             \$4*$VL-1, $DATALEN
+    ja              .Lcrypt_loop_4x$local_label_suffix
+___
+
+    if ($enc) {
+
+        # Update GHASH with the last set of ciphertext blocks.
+        $code .= <<___;
+.Lghash_last_ciphertext_4x$local_label_suffix:
+        @{[ _ghash_4x ]}
+___
+    }
+
+    my $POWERS_PTR = $BE_CTR_PTR;    # BE_CTR_PTR is free to be reused.
+
+    $code .= <<___;
+.Lcrypt_loop_4x_done$local_label_suffix:
+    # Check whether any data remains.
+    test            $DATALEN, $DATALEN
+    jz              .Ldone$local_label_suffix
+
+    # The data length isn't a multiple of 4*VL.  Process the remaining data
+    # of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time.
+    # Going one vector at a time may seem inefficient compared to having
+    # separate code paths for each possible number of vectors remaining.
+    # However, using a loop keeps the code size down, and it performs
+    # surprisingly well; modern CPUs will start executing the next iteration
+    # before the previous one finishes and also predict the number of loop
+    # iterations.  For a similar reason, we roll up the AES rounds.
+    #
+    # On the last iteration, the remaining length may be less than VL.
+    # Handle this using masking.
+    #
+    # Since there are enough key powers available for all remaining data,
+    # there is no need to do a GHASH reduction after each iteration.
+    # Instead, multiply each remaining block by its own key power, and only
+    # do a GHASH reduction at the very end.
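+    #
+    # For example, if 3 blocks remain, the code below computes
+    #     (ACC + C1)*H^3 + C2*H^2 + C3*H^1
+    # in unreduced (LO, MI, HI) form, where ACC is the current GHASH
+    # accumulator and C1..C3 are the remaining ciphertext blocks, then
+    # reduces it once at the end.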
+
+    # Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N
+    # is the number of blocks that remain.
+    mov             $DATALEN, %rax
+    neg             %rax
+    and             \$-16, %rax  # -round_up(DATALEN, 16)
+    lea             $OFFSETOFEND_H_POWERS($H_POWERS,%rax), $POWERS_PTR
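+    # For example, if DATALEN = 40 then N = 3, %rax = -48, and POWERS_PTR
+    # ends up 48 bytes before the end of the key powers array, i.e. pointing
+    # at [H^3, H^2, H^1].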
+___
+
+    # Start collecting the unreduced GHASH intermediate value LO, MI, HI.
+    my ( $LO, $LO_XMM ) = ( $GHASHDATA0, $GHASHDATA0_XMM );
+    my ( $MI, $MI_XMM ) = ( $GHASHDATA1, $GHASHDATA1_XMM );
+    my ( $HI, $HI_XMM ) = ( $GHASHDATA2, $GHASHDATA2_XMM );
+    $code .= <<___;
+    vpxor           $LO_XMM, $LO_XMM, $LO_XMM
+    vpxor           $MI_XMM, $MI_XMM, $MI_XMM
+    vpxor           $HI_XMM, $HI_XMM, $HI_XMM
+
+    cmp             \$$VL, $DATALEN
+    jb              .Lpartial_vec$local_label_suffix
+
+.Lcrypt_loop_1x$local_label_suffix:
+    # Process a full vector of length VL.
+
+    # Encrypt a vector of counter blocks.
+    vpshufb         $BSWAP_MASK, $LE_CTR, $V0
+    vpaddd          $LE_CTR_INC, $LE_CTR, $LE_CTR
+    vpxord          $RNDKEY0, $V0, $V0
+    lea             16($AESKEY), %rax
+.Lvaesenc_loop_tail_full_vec$local_label_suffix:
+    vbroadcasti32x4 (%rax), $RNDKEY
+    vaesenc         $RNDKEY, $V0, $V0
+    add             \$16, %rax
+    cmp             %rax, $RNDKEYLAST_PTR
+    jne             .Lvaesenc_loop_tail_full_vec$local_label_suffix
+    vaesenclast     $RNDKEYLAST, $V0, $V0
+
+    # XOR the data with the vector of keystream blocks.
+    vmovdqu8        ($SRC), $V1
+    vpxord          $V1, $V0, $V0
+    vmovdqu8        $V0, ($DST)
+
+    # Update GHASH with the ciphertext blocks, without reducing.
+    vmovdqu8        ($POWERS_PTR), $H_POW1
+    vpshufb         $BSWAP_MASK, @{[ $enc ? $V0 : $V1 ]}, $V0
+    vpxord          $GHASH_ACC, $V0, $V0
+    @{[ _ghash_mul_noreduce $H_POW1, $V0, $LO, $MI, $HI, $GHASHDATA3,
+                            $V1, $V2, $V3 ]}
+    vpxor           $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+
+    add             \$$VL, $POWERS_PTR
+    add             \$$VL, $SRC
+    add             \$$VL, $DST
+    sub             \$$VL, $DATALEN
+    cmp             \$$VL, $DATALEN
+    jae             .Lcrypt_loop_1x$local_label_suffix
+
+    test            $DATALEN, $DATALEN
+    jz              .Lreduce$local_label_suffix
+
+.Lpartial_vec$local_label_suffix:
+    # Process a partial vector of length 1 <= DATALEN < VL.
+
+    # Set the data mask %k1 to DATALEN 1's.
+    # Set the key powers mask %k2 to round_up(DATALEN, 16) 1's.
+    mov             \$-1, %rax
+    bzhi            $DATALEN, %rax, %rax
+    @{[ $VL < 64 ? "kmovd %eax, %k1" : "kmovq %rax, %k1" ]}
+    add             \$15, $DATALEN
+    and             \$-16, $DATALEN
+    mov             \$-1, %rax
+    bzhi            $DATALEN, %rax, %rax
+    @{[ $VL < 64 ? "kmovd %eax, %k2" : "kmovq %rax, %k2" ]}
+
+    # Encrypt one last vector of counter blocks.  This does not need to be
+    # masked.  The counter does not need to be incremented here.
+    vpshufb         $BSWAP_MASK, $LE_CTR, $V0
+    vpxord          $RNDKEY0, $V0, $V0
+    lea             16($AESKEY), %rax
+.Lvaesenc_loop_tail_partialvec$local_label_suffix:
+    vbroadcasti32x4 (%rax), $RNDKEY
+    vaesenc         $RNDKEY, $V0, $V0
+    add             \$16, %rax
+    cmp             %rax, $RNDKEYLAST_PTR
+    jne             .Lvaesenc_loop_tail_partialvec$local_label_suffix
+    vaesenclast     $RNDKEYLAST, $V0, $V0
+
+    # XOR the data with the appropriate number of keystream bytes.
+    vmovdqu8        ($SRC), $V1\{%k1}{z}
+    vpxord          $V1, $V0, $V0
+    vmovdqu8        $V0, ($DST){%k1}
+
+    # Update GHASH with the ciphertext block(s), without reducing.
+    #
+    # In the case of DATALEN < VL, the ciphertext is zero-padded to VL.
+    # (If decrypting, it's done by the above masked load.  If encrypting,
+    # it's done by the below masked register-to-register move.)  Note that
+    # if DATALEN <= VL - 16, there will be additional padding beyond the
+    # padding of the last block specified by GHASH itself; i.e., there may
+    # be whole block(s) that get processed by the GHASH multiplication and
+    # reduction instructions but should not actually be included in the
+    # GHASH.  However, any such blocks are all-zeroes, and the values that
+    # they're multiplied with are also all-zeroes.  Therefore they just add
+    # 0 * 0 = 0 to the final GHASH result, which makes no difference.
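+    # (Continuing the DATALEN = 20 example with VL = 64: the last 12 bytes
+    # of block 2 are the usual GHASH zero-padding, while blocks 3 and 4 are
+    # entirely zero and are multiplied by key powers that %k2 also zeroed,
+    # so they add nothing.)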
+    vmovdqu8        ($POWERS_PTR), $H_POW1\{%k2}{z}
+    @{[ $enc ? "vmovdqu8 $V0, $V1\{%k1}{z}" : "" ]}
+    vpshufb         $BSWAP_MASK, $V1, $V0
+    vpxord          $GHASH_ACC, $V0, $V0
+    @{[ _ghash_mul_noreduce $H_POW1, $V0, $LO, $MI, $HI, $GHASHDATA3,
+                            $V1, $V2, $V3 ]}
+
+.Lreduce$local_label_suffix:
+    # Finally, do the GHASH reduction.
+    @{[ _ghash_reduce   $LO, $MI, $HI, $GFPOLY, $V0 ]}
+    @{[ _horizontal_xor $HI, $HI_XMM, $GHASH_ACC_XMM,
+                        "%xmm0", "%xmm1", "%xmm2" ]}
+
+.Ldone$local_label_suffix:
+    # Store the updated GHASH accumulator back to memory.
+    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+    vmovdqu         $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
+
+    vzeroupper      # This is needed after using ymm or zmm registers.
+___
+    return $code;
+}
+
+# void gcm_gmult_vpclmulqdq_avx10(uint8_t Xi[16], const u128 Htable[16]);
+$code .= _begin_func "gcm_gmult_vpclmulqdq_avx10", 1;
+{
+    my ( $GHASH_ACC_PTR, $H_POWERS ) = @argregs[ 0 .. 1 ];
+    my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
+      map( "%xmm$_", ( 0 .. 6 ) );
+
+    $code .= <<___;
+    @{[ _save_xmmregs (6) ]}
+    .seh_endprologue
+
+    vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC
+    vmovdqu         .Lbswap_mask(%rip), $BSWAP_MASK
+    vmovdqu         $OFFSETOFEND_H_POWERS-16($H_POWERS), $H_POW1
+    vmovdqu         .Lgfpoly(%rip), $GFPOLY
+    vpshufb         $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
+
+    @{[ _ghash_mul  $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $T0, $T1, $T2 ]}
+
+    vpshufb         $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
+    vmovdqu         $GHASH_ACC, ($GHASH_ACC_PTR)
+___
+}
+$code .= _end_func;
+
+_set_veclen 32;
+
+$code .= _begin_func "gcm_init_vpclmulqdq_avx10", 0;
+$code .= _aes_gcm_init;
+$code .= _end_func;
+
+$code .= _begin_func "gcm_ghash_vpclmulqdq_avx10_256", 1;
+$code .= _ghash_update;
+$code .= _end_func;
+
+$code .= _begin_func "aes_gcm_enc_update_vaes_avx10_256", 1;
+$code .= _aes_gcm_update 1;
+$code .= _end_func;
+
+$code .= _begin_func "aes_gcm_dec_update_vaes_avx10_256", 1;
+$code .= _aes_gcm_update 0;
+$code .= _end_func;
+
+_set_veclen 64;
+
+$code .= _begin_func "gcm_ghash_vpclmulqdq_avx10_512", 1;
+$code .= _ghash_update;
+$code .= _end_func;
+
+$code .= _begin_func "aes_gcm_enc_update_vaes_avx10_512", 1;
+$code .= _aes_gcm_update 1;
+$code .= _end_func;
+
+$code .= _begin_func "aes_gcm_dec_update_vaes_avx10_512", 1;
+$code .= _aes_gcm_update 0;
+$code .= _end_func;
+
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
+exit 0;
diff --git a/crypto/fipsmodule/modes/gcm.cc.inc b/crypto/fipsmodule/modes/gcm.cc.inc
index 8413951..2d67eea 100644
--- a/crypto/fipsmodule/modes/gcm.cc.inc
+++ b/crypto/fipsmodule/modes/gcm.cc.inc
@@ -135,14 +135,42 @@
 #if defined(HW_GCM) && defined(OPENSSL_X86_64)
 static size_t hw_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                              const AES_KEY *key, uint8_t ivec[16],
-                             uint8_t Xi[16], const u128 Htable[16]) {
-  return aesni_gcm_encrypt(in, out, len, key, ivec, Htable, Xi);
+                             uint8_t Xi[16], const u128 Htable[16],
+                             enum gcm_impl_t impl) {
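+  // The VAES-based update functions take |ivec| as const and do not advance
+  // the counter themselves, so the 32-bit big-endian block counter is
+  // advanced here by the number of blocks processed.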
+  switch (impl) {
+    case gcm_x86_vaes_avx10_256:
+      len &= kSizeTWithoutLower4Bits;
+      aes_gcm_enc_update_vaes_avx10_256(in, out, len, key, ivec, Htable, Xi);
+      CRYPTO_store_u32_be(&ivec[12], CRYPTO_load_u32_be(&ivec[12]) + len / 16);
+      return len;
+    case gcm_x86_vaes_avx10_512:
+      len &= kSizeTWithoutLower4Bits;
+      aes_gcm_enc_update_vaes_avx10_512(in, out, len, key, ivec, Htable, Xi);
+      CRYPTO_store_u32_be(&ivec[12], CRYPTO_load_u32_be(&ivec[12]) + len / 16);
+      return len;
+    default:
+      return aesni_gcm_encrypt(in, out, len, key, ivec, Htable, Xi);
+  }
 }
 
 static size_t hw_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len,
                              const AES_KEY *key, uint8_t ivec[16],
-                             uint8_t Xi[16], const u128 Htable[16]) {
-  return aesni_gcm_decrypt(in, out, len, key, ivec, Htable, Xi);
+                             uint8_t Xi[16], const u128 Htable[16],
+                             enum gcm_impl_t impl) {
+  switch (impl) {
+    case gcm_x86_vaes_avx10_256:
+      len &= kSizeTWithoutLower4Bits;
+      aes_gcm_dec_update_vaes_avx10_256(in, out, len, key, ivec, Htable, Xi);
+      CRYPTO_store_u32_be(&ivec[12], CRYPTO_load_u32_be(&ivec[12]) + len / 16);
+      return len;
+    case gcm_x86_vaes_avx10_512:
+      len &= kSizeTWithoutLower4Bits;
+      aes_gcm_dec_update_vaes_avx10_512(in, out, len, key, ivec, Htable, Xi);
+      CRYPTO_store_u32_be(&ivec[12], CRYPTO_load_u32_be(&ivec[12]) + len / 16);
+      return len;
+    default:
+      return aesni_gcm_decrypt(in, out, len, key, ivec, Htable, Xi);
+  }
 }
 #endif  // HW_GCM && X86_64
 
@@ -150,7 +178,8 @@
 
 static size_t hw_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                              const AES_KEY *key, uint8_t ivec[16],
-                             uint8_t Xi[16], const u128 Htable[16]) {
+                             uint8_t Xi[16], const u128 Htable[16],
+                             enum gcm_impl_t impl) {
   const size_t len_blocks = len & kSizeTWithoutLower4Bits;
   if (!len_blocks) {
     return 0;
@@ -161,7 +190,8 @@
 
 static size_t hw_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len,
                              const AES_KEY *key, uint8_t ivec[16],
-                             uint8_t Xi[16], const u128 Htable[16]) {
+                             uint8_t Xi[16], const u128 Htable[16],
+                             enum gcm_impl_t impl) {
   const size_t len_blocks = len & kSizeTWithoutLower4Bits;
   if (!len_blocks) {
     return 0;
@@ -173,21 +203,28 @@
 #endif  // HW_GCM && AARCH64
 
 void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash,
-                       u128 out_table[16], int *out_is_avx,
-                       const uint8_t gcm_key[16]) {
-  *out_is_avx = 0;
-
+                       u128 out_table[16], const uint8_t gcm_key[16]) {
   // H is passed to |gcm_init_*| as a pair of byte-swapped, 64-bit values.
   uint64_t H[2] = {CRYPTO_load_u64_be(gcm_key),
                    CRYPTO_load_u64_be(gcm_key + 8)};
 
 #if defined(GHASH_ASM_X86_64)
   if (crypto_gcm_clmul_enabled()) {
+    if (CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() &&
+        CRYPTO_is_VPCLMULQDQ_capable() && CRYPTO_is_BMI2_capable()) {
+      gcm_init_vpclmulqdq_avx10(out_table, H);
+      *out_mult = gcm_gmult_vpclmulqdq_avx10;
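+      // Use 256-bit vectors on CPUs where zmm registers are best avoided;
+      // otherwise use the 512-bit implementation.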
+      if (CRYPTO_cpu_avoid_zmm_registers()) {
+        *out_hash = gcm_ghash_vpclmulqdq_avx10_256;
+      } else {
+        *out_hash = gcm_ghash_vpclmulqdq_avx10_512;
+      }
+      return;
+    }
     if (CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable()) {
       gcm_init_avx(out_table, H);
       *out_mult = gcm_gmult_avx;
       *out_hash = gcm_ghash_avx;
-      *out_is_avx = 1;
       return;
     }
     gcm_init_clmul(out_table, H);
@@ -244,14 +281,25 @@
   OPENSSL_memset(ghash_key, 0, sizeof(ghash_key));
   (*block)(ghash_key, ghash_key, aes_key);
 
-  int is_avx;
-  CRYPTO_ghash_init(&gcm_key->gmult, &gcm_key->ghash, gcm_key->Htable, &is_avx,
+  CRYPTO_ghash_init(&gcm_key->gmult, &gcm_key->ghash, gcm_key->Htable,
                     ghash_key);
 
-#if defined(OPENSSL_AARCH64) && !defined(OPENSSL_NO_ASM)
-  gcm_key->use_hw_gcm_crypt = (gcm_pmull_capable() && block_is_hwaes) ? 1 : 0;
-#else
-  gcm_key->use_hw_gcm_crypt = (is_avx && block_is_hwaes) ? 1 : 0;
+#if !defined(OPENSSL_NO_ASM)
+#if defined(OPENSSL_X86_64)
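+  // CRYPTO_ghash_init already checked the AVX512 and VPCLMULQDQ features
+  // when it selected |ghash|; VAES is the remaining requirement for the
+  // combined AES-GCM functions.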
+  if (gcm_key->ghash == gcm_ghash_vpclmulqdq_avx10_256 &&
+      CRYPTO_is_VAES_capable()) {
+    gcm_key->impl = gcm_x86_vaes_avx10_256;
+  } else if (gcm_key->ghash == gcm_ghash_vpclmulqdq_avx10_512 &&
+             CRYPTO_is_VAES_capable()) {
+    gcm_key->impl = gcm_x86_vaes_avx10_512;
+  } else if (gcm_key->ghash == gcm_ghash_avx && block_is_hwaes) {
+    gcm_key->impl = gcm_x86_aesni;
+  }
+#elif defined(OPENSSL_AARCH64)
+  if (gcm_pmull_capable() && block_is_hwaes) {
+    gcm_key->impl = gcm_arm64_aes;
+  }
+#endif
 #endif
 }
 
@@ -565,11 +613,11 @@
 
 #if defined(HW_GCM)
   // Check |len| to work around a C language bug. See https://crbug.com/1019588.
-  if (ctx->gcm_key.use_hw_gcm_crypt && len > 0) {
+  if (ctx->gcm_key.impl != gcm_separate && len > 0) {
     // |hw_gcm_encrypt| may not process all the input given to it. It may
     // not process *any* of its input if it is deemed too small.
     size_t bulk = hw_gcm_encrypt(in, out, len, key, ctx->Yi, ctx->Xi,
-                                 ctx->gcm_key.Htable);
+                                 ctx->gcm_key.Htable, ctx->gcm_key.impl);
     in += bulk;
     out += bulk;
     len -= bulk;
@@ -654,11 +702,11 @@
 
 #if defined(HW_GCM)
   // Check |len| to work around a C language bug. See https://crbug.com/1019588.
-  if (ctx->gcm_key.use_hw_gcm_crypt && len > 0) {
+  if (ctx->gcm_key.impl != gcm_separate && len > 0) {
     // |hw_gcm_decrypt| may not process all the input given to it. It may
     // not process *any* of its input if it is deemed too small.
     size_t bulk = hw_gcm_decrypt(in, out, len, key, ctx->Yi, ctx->Xi,
-                                 ctx->gcm_key.Htable);
+                                 ctx->gcm_key.Htable, ctx->gcm_key.impl);
     in += bulk;
     out += bulk;
     len -= bulk;
diff --git a/crypto/fipsmodule/modes/gcm_test.cc b/crypto/fipsmodule/modes/gcm_test.cc
index 1729e0d..53415f4 100644
--- a/crypto/fipsmodule/modes/gcm_test.cc
+++ b/crypto/fipsmodule/modes/gcm_test.cc
@@ -81,6 +81,45 @@
         }
       }
     }
+    if (CRYPTO_is_VAES_capable() && CRYPTO_is_VPCLMULQDQ_capable() &&
+        CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() &&
+        CRYPTO_is_BMI2_capable()) {
+      AES_KEY aes_key;
+      static const uint8_t kKey[16] = {0};
+      uint8_t iv[16] = {0};
+
+      CHECK_ABI_SEH(gcm_init_vpclmulqdq_avx10, Htable, kH);
+      CHECK_ABI_SEH(gcm_gmult_vpclmulqdq_avx10, X, Htable);
+      for (size_t blocks : kBlockCounts) {
+        CHECK_ABI_SEH(gcm_ghash_vpclmulqdq_avx10_256, X, Htable, buf,
+                      16 * blocks);
+        CHECK_ABI_SEH(gcm_ghash_vpclmulqdq_avx10_512, X, Htable, buf,
+                      16 * blocks);
+      }
+
+      aes_hw_set_encrypt_key(kKey, 128, &aes_key);
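+      // Lengths of the form |blocks * 16 + 7| exercise the masked
+      // partial-block handling.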
+      for (size_t blocks : kBlockCounts) {
+        CHECK_ABI_SEH(aes_gcm_enc_update_vaes_avx10_256, buf, buf, blocks * 16,
+                      &aes_key, iv, Htable, X);
+        CHECK_ABI_SEH(aes_gcm_enc_update_vaes_avx10_256, buf, buf,
+                      blocks * 16 + 7, &aes_key, iv, Htable, X);
+        CHECK_ABI_SEH(aes_gcm_enc_update_vaes_avx10_512, buf, buf, blocks * 16,
+                      &aes_key, iv, Htable, X);
+        CHECK_ABI_SEH(aes_gcm_enc_update_vaes_avx10_512, buf, buf,
+                      blocks * 16 + 7, &aes_key, iv, Htable, X);
+      }
+      aes_hw_set_decrypt_key(kKey, 128, &aes_key);
+      for (size_t blocks : kBlockCounts) {
+        CHECK_ABI_SEH(aes_gcm_dec_update_vaes_avx10_256, buf, buf, blocks * 16,
+                      &aes_key, iv, Htable, X);
+        CHECK_ABI_SEH(aes_gcm_dec_update_vaes_avx10_256, buf, buf,
+                      blocks * 16 + 7, &aes_key, iv, Htable, X);
+        CHECK_ABI_SEH(aes_gcm_dec_update_vaes_avx10_512, buf, buf, blocks * 16,
+                      &aes_key, iv, Htable, X);
+        CHECK_ABI_SEH(aes_gcm_dec_update_vaes_avx10_512, buf, buf,
+                      blocks * 16 + 7, &aes_key, iv, Htable, X);
+      }
+    }
 #endif  // GHASH_ASM_X86_64
   }
 #endif  // GHASH_ASM_X86 || GHASH_ASM_X86_64
diff --git a/crypto/fipsmodule/modes/internal.h b/crypto/fipsmodule/modes/internal.h
index 601dab7..4cedc39 100644
--- a/crypto/fipsmodule/modes/internal.h
+++ b/crypto/fipsmodule/modes/internal.h
@@ -126,6 +126,15 @@
 // can be safely copied. Additionally, |gcm_key| is split into a separate
 // struct.
 
+// gcm_impl_t specifies an assembly implementation of AES-GCM.
+enum gcm_impl_t {
+  gcm_separate = 0,  // No combined AES-GCM, but may have AES-CTR and GHASH.
+  gcm_x86_aesni,
+  gcm_x86_vaes_avx10_256,
+  gcm_x86_vaes_avx10_512,
+  gcm_arm64_aes,
+};
+
 typedef struct { uint64_t hi,lo; } u128;
 
 // gmult_func multiplies |Xi| by the GCM key and writes the result back to
@@ -148,10 +157,7 @@
   ghash_func ghash;
 
   block128_f block;
-
-  // use_hw_gcm_crypt is true if this context should use platform-specific
-  // assembly to process GCM data.
-  unsigned use_hw_gcm_crypt:1;
+  enum gcm_impl_t impl;
 } GCM128_KEY;
 
 // GCM128_CONTEXT contains state for a single GCM operation. The structure
@@ -182,11 +188,9 @@
 
 // CRYPTO_ghash_init writes a precomputed table of powers of |gcm_key| to
 // |out_table| and sets |*out_mult| and |*out_hash| to (potentially hardware
-// accelerated) functions for performing operations in the GHASH field. If the
-// AVX implementation was used |*out_is_avx| will be true.
+// accelerated) functions for performing operations in the GHASH field.
 void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash,
-                       u128 out_table[16], int *out_is_avx,
-                       const uint8_t gcm_key[16]);
+                       u128 out_table[16], const uint8_t gcm_key[16]);
 
 // CRYPTO_gcm128_init_key initialises |gcm_key| to use |block| (typically AES)
 // with the given key. |block_is_hwaes| is one if |block| is |aes_hw_encrypt|.
@@ -279,6 +283,30 @@
 size_t aesni_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len,
                          const AES_KEY *key, uint8_t ivec[16],
                          const u128 Htable[16], uint8_t Xi[16]);
+
+void gcm_init_vpclmulqdq_avx10(u128 Htable[16], const uint64_t H[2]);
+void gcm_gmult_vpclmulqdq_avx10(uint8_t Xi[16], const u128 Htable[16]);
+void gcm_ghash_vpclmulqdq_avx10_256(uint8_t Xi[16], const u128 Htable[16],
+                                    const uint8_t *in, size_t len);
+void gcm_ghash_vpclmulqdq_avx10_512(uint8_t Xi[16], const u128 Htable[16],
+                                    const uint8_t *in, size_t len);
+void aes_gcm_enc_update_vaes_avx10_256(const uint8_t *in, uint8_t *out,
+                                       size_t len, const AES_KEY *key,
+                                       const uint8_t ivec[16],
+                                       const u128 Htable[16], uint8_t Xi[16]);
+void aes_gcm_dec_update_vaes_avx10_256(const uint8_t *in, uint8_t *out,
+                                       size_t len, const AES_KEY *key,
+                                       const uint8_t ivec[16],
+                                       const u128 Htable[16], uint8_t Xi[16]);
+void aes_gcm_enc_update_vaes_avx10_512(const uint8_t *in, uint8_t *out,
+                                       size_t len, const AES_KEY *key,
+                                       const uint8_t ivec[16],
+                                       const u128 Htable[16], uint8_t Xi[16]);
+void aes_gcm_dec_update_vaes_avx10_512(const uint8_t *in, uint8_t *out,
+                                       size_t len, const AES_KEY *key,
+                                       const uint8_t ivec[16],
+                                       const u128 Htable[16], uint8_t Xi[16]);
+
 #endif  // OPENSSL_X86_64
 
 #if defined(OPENSSL_X86)
diff --git a/crypto/fipsmodule/modes/polyval.cc.inc b/crypto/fipsmodule/modes/polyval.cc.inc
index 4e53222..5e9e664 100644
--- a/crypto/fipsmodule/modes/polyval.cc.inc
+++ b/crypto/fipsmodule/modes/polyval.cc.inc
@@ -56,8 +56,7 @@
   OPENSSL_memcpy(H, key, 16);
   reverse_and_mulX_ghash(H);
 
-  int is_avx;
-  CRYPTO_ghash_init(&ctx->gmult, &ctx->ghash, ctx->Htable, &is_avx, H);
+  CRYPTO_ghash_init(&ctx->gmult, &ctx->ghash, ctx->Htable, H);
   OPENSSL_memset(&ctx->S, 0, sizeof(ctx->S));
 }
 
diff --git a/crypto/impl_dispatch_test.cc b/crypto/impl_dispatch_test.cc
index 631e78f..3eb6cfa 100644
--- a/crypto/impl_dispatch_test.cc
+++ b/crypto/impl_dispatch_test.cc
@@ -36,6 +36,10 @@
     aesni_ = CRYPTO_is_AESNI_capable();
     avx_movbe_ = CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable();
     ssse3_ = CRYPTO_is_SSSE3_capable();
+    vaes_ = CRYPTO_is_VAES_capable() && CRYPTO_is_VPCLMULQDQ_capable() &&
+            CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() &&
+            CRYPTO_is_BMI2_capable();
+    avoid_zmm_ = CRYPTO_cpu_avoid_zmm_registers();
     is_x86_64_ =
 #if defined(OPENSSL_X86_64)
         true;
@@ -75,6 +79,8 @@
   bool avx_movbe_ = false;
   bool ssse3_ = false;
   bool is_x86_64_ = false;
+  bool vaes_ = false;
+  bool avoid_zmm_ = false;
 #endif
 };
 
@@ -87,16 +93,23 @@
 constexpr size_t kFlag_aes_hw_set_encrypt_key = 3;
 constexpr size_t kFlag_vpaes_encrypt = 4;
 constexpr size_t kFlag_vpaes_set_encrypt_key = 5;
+constexpr size_t kFlag_aes_gcm_enc_update_vaes_avx10_256 = 6;
+constexpr size_t kFlag_aes_gcm_enc_update_vaes_avx10_512 = 7;
 
 TEST_F(ImplDispatchTest, AEAD_AES_GCM) {
   AssertFunctionsHit(
       {
-          {kFlag_aes_hw_ctr32_encrypt_blocks, aesni_},
+          {kFlag_aes_hw_ctr32_encrypt_blocks, aesni_ && !(is_x86_64_ && vaes_)},
           {kFlag_aes_hw_encrypt, aesni_},
           {kFlag_aes_hw_set_encrypt_key, aesni_},
-          {kFlag_aesni_gcm_encrypt, is_x86_64_ && aesni_ && avx_movbe_},
+          {kFlag_aesni_gcm_encrypt,
+           is_x86_64_ && aesni_ && avx_movbe_ && !vaes_},
           {kFlag_vpaes_encrypt, ssse3_ && !aesni_},
           {kFlag_vpaes_set_encrypt_key, ssse3_ && !aesni_},
+          {kFlag_aes_gcm_enc_update_vaes_avx10_256,
+           is_x86_64_ && vaes_ && avoid_zmm_},
+          {kFlag_aes_gcm_enc_update_vaes_avx10_512,
+           is_x86_64_ && vaes_ && !avoid_zmm_},
       },
       [] {
         const uint8_t kZeros[16] = {0};
diff --git a/crypto/internal.h b/crypto/internal.h
index 8c1900b..46decef 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -1537,7 +1537,9 @@
 //   3: aes_hw_set_encrypt_key
 //   4: vpaes_encrypt
 //   5: vpaes_set_encrypt_key
-extern uint8_t BORINGSSL_function_hit[7];
+//   6: aes_gcm_enc_update_vaes_avx10_256
+//   7: aes_gcm_enc_update_vaes_avx10_512
+extern uint8_t BORINGSSL_function_hit[8];
 #endif  // BORINGSSL_DISPATCH_TEST
 
 // OPENSSL_vasprintf_internal is just like |vasprintf(3)|. If |system_malloc| is
diff --git a/gen/bcm/aes-gcm-avx10-x86_64-apple.S b/gen/bcm/aes-gcm-avx10-x86_64-apple.S
new file mode 100644
index 0000000..b75bb07
--- /dev/null
+++ b/gen/bcm/aes-gcm-avx10-x86_64-apple.S
@@ -0,0 +1,2264 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.section	__DATA,__const
+.p2align	6
+
+
+L$bswap_mask:
+.quad	0x08090a0b0c0d0e0f, 0x0001020304050607
+
+
+
+
+
+
+
+
+L$gfpoly:
+.quad	1, 0xc200000000000000
+
+
+L$gfpoly_and_internal_carrybit:
+.quad	1, 0xc200000000000001
+
+
+
+
+
+L$ctr_pattern:
+.quad	0, 0
+.quad	1, 0
+L$inc_2blocks:
+.quad	2, 0
+.quad	3, 0
+L$inc_4blocks:
+.quad	4, 0
+
+.text	
+.globl	_gcm_gmult_vpclmulqdq_avx10
+.private_extern _gcm_gmult_vpclmulqdq_avx10
+
+.p2align	5
+_gcm_gmult_vpclmulqdq_avx10:
+
+
+_CET_ENDBR
+
+
+
+	vmovdqu	(%rdi),%xmm0
+	vmovdqu	L$bswap_mask(%rip),%xmm1
+	vmovdqu	256-16(%rsi),%xmm2
+	vmovdqu	L$gfpoly(%rip),%xmm3
+	vpshufb	%xmm1,%xmm0,%xmm0
+
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm4
+	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm5
+	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm6
+	vpxord	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x01,%xmm4,%xmm3,%xmm6
+	vpshufd	$0x4e,%xmm4,%xmm4
+	vpternlogd	$0x96,%xmm6,%xmm4,%xmm5
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm0
+	vpclmulqdq	$0x01,%xmm5,%xmm3,%xmm4
+	vpshufd	$0x4e,%xmm5,%xmm5
+	vpternlogd	$0x96,%xmm4,%xmm5,%xmm0
+
+
+	vpshufb	%xmm1,%xmm0,%xmm0
+	vmovdqu	%xmm0,(%rdi)
+	ret
+
+
+
+.globl	_gcm_init_vpclmulqdq_avx10
+.private_extern _gcm_init_vpclmulqdq_avx10
+
+.p2align	5
+_gcm_init_vpclmulqdq_avx10:
+
+
+_CET_ENDBR
+
+	leaq	256-32(%rdi),%r8
+
+
+
+	vpshufd	$0x4e,(%rsi),%xmm3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	vpshufd	$0xd3,%xmm3,%xmm0
+	vpsrad	$31,%xmm0,%xmm0
+	vpaddq	%xmm3,%xmm3,%xmm3
+
+	vpternlogd	$0x78,L$gfpoly_and_internal_carrybit(%rip),%xmm0,%xmm3
+
+
+	vbroadcasti32x4	L$gfpoly(%rip),%ymm5
+
+
+
+
+
+
+
+
+	vpclmulqdq	$0x00,%xmm3,%xmm3,%xmm0
+	vpclmulqdq	$0x01,%xmm3,%xmm3,%xmm1
+	vpclmulqdq	$0x10,%xmm3,%xmm3,%xmm2
+	vpxord	%xmm2,%xmm1,%xmm1
+	vpclmulqdq	$0x01,%xmm0,%xmm5,%xmm2
+	vpshufd	$0x4e,%xmm0,%xmm0
+	vpternlogd	$0x96,%xmm2,%xmm0,%xmm1
+	vpclmulqdq	$0x11,%xmm3,%xmm3,%xmm4
+	vpclmulqdq	$0x01,%xmm1,%xmm5,%xmm0
+	vpshufd	$0x4e,%xmm1,%xmm1
+	vpternlogd	$0x96,%xmm0,%xmm1,%xmm4
+
+
+
+	vinserti128	$1,%xmm3,%ymm4,%ymm3
+	vinserti128	$1,%xmm4,%ymm4,%ymm4
+
+	vmovdqu8	%ymm3,(%r8)
+
+
+
+
+
+	movl	$7,%eax
+L$precompute_next__func1:
+	subq	$32,%r8
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm0
+	vpclmulqdq	$0x01,%ymm4,%ymm3,%ymm1
+	vpclmulqdq	$0x10,%ymm4,%ymm3,%ymm2
+	vpxord	%ymm2,%ymm1,%ymm1
+	vpclmulqdq	$0x01,%ymm0,%ymm5,%ymm2
+	vpshufd	$0x4e,%ymm0,%ymm0
+	vpternlogd	$0x96,%ymm2,%ymm0,%ymm1
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm3
+	vpclmulqdq	$0x01,%ymm1,%ymm5,%ymm0
+	vpshufd	$0x4e,%ymm1,%ymm1
+	vpternlogd	$0x96,%ymm0,%ymm1,%ymm3
+
+	vmovdqu8	%ymm3,(%r8)
+	decl	%eax
+	jnz	L$precompute_next__func1
+
+	vzeroupper
+	ret
+
+
+
+.globl	_gcm_ghash_vpclmulqdq_avx10_256
+.private_extern _gcm_ghash_vpclmulqdq_avx10_256
+
+.p2align	5
+_gcm_ghash_vpclmulqdq_avx10_256:
+
+
+_CET_ENDBR
+
+
+
+
+
+
+	vmovdqu	L$bswap_mask(%rip),%xmm4
+	vmovdqu	L$gfpoly(%rip),%xmm10
+
+
+	vmovdqu	(%rdi),%xmm5
+	vpshufb	%xmm4,%xmm5,%xmm5
+
+
+	cmpq	$32,%rcx
+	jb	L$aad_blockbyblock__func1
+
+
+
+	vshufi64x2	$0,%ymm4,%ymm4,%ymm4
+	vshufi64x2	$0,%ymm10,%ymm10,%ymm10
+
+
+	vmovdqu8	256-32(%rsi),%ymm9
+
+	cmpq	$128-1,%rcx
+	jbe	L$aad_loop_1x__func1
+
+
+	vmovdqu8	256-128(%rsi),%ymm6
+	vmovdqu8	256-96(%rsi),%ymm7
+	vmovdqu8	256-64(%rsi),%ymm8
+
+
+L$aad_loop_4x__func1:
+	vmovdqu8	0(%rdx),%ymm0
+	vmovdqu8	32(%rdx),%ymm1
+	vmovdqu8	64(%rdx),%ymm2
+	vmovdqu8	96(%rdx),%ymm3
+	vpshufb	%ymm4,%ymm0,%ymm0
+	vpxord	%ymm5,%ymm0,%ymm0
+	vpshufb	%ymm4,%ymm1,%ymm1
+	vpshufb	%ymm4,%ymm2,%ymm2
+	vpshufb	%ymm4,%ymm3,%ymm3
+	vpclmulqdq	$0x00,%ymm6,%ymm0,%ymm5
+	vpclmulqdq	$0x00,%ymm7,%ymm1,%ymm11
+	vpclmulqdq	$0x00,%ymm8,%ymm2,%ymm12
+	vpxord	%ymm11,%ymm5,%ymm5
+	vpclmulqdq	$0x00,%ymm9,%ymm3,%ymm13
+	vpternlogd	$0x96,%ymm13,%ymm12,%ymm5
+	vpclmulqdq	$0x01,%ymm6,%ymm0,%ymm11
+	vpclmulqdq	$0x01,%ymm7,%ymm1,%ymm12
+	vpclmulqdq	$0x01,%ymm8,%ymm2,%ymm13
+	vpternlogd	$0x96,%ymm13,%ymm12,%ymm11
+	vpclmulqdq	$0x01,%ymm9,%ymm3,%ymm12
+	vpclmulqdq	$0x10,%ymm6,%ymm0,%ymm13
+	vpternlogd	$0x96,%ymm13,%ymm12,%ymm11
+	vpclmulqdq	$0x10,%ymm7,%ymm1,%ymm12
+	vpclmulqdq	$0x10,%ymm8,%ymm2,%ymm13
+	vpternlogd	$0x96,%ymm13,%ymm12,%ymm11
+	vpclmulqdq	$0x01,%ymm5,%ymm10,%ymm13
+	vpclmulqdq	$0x10,%ymm9,%ymm3,%ymm12
+	vpxord	%ymm12,%ymm11,%ymm11
+	vpshufd	$0x4e,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm6,%ymm0,%ymm0
+	vpclmulqdq	$0x11,%ymm7,%ymm1,%ymm1
+	vpclmulqdq	$0x11,%ymm8,%ymm2,%ymm2
+	vpternlogd	$0x96,%ymm13,%ymm5,%ymm11
+	vpclmulqdq	$0x11,%ymm9,%ymm3,%ymm3
+	vpternlogd	$0x96,%ymm2,%ymm1,%ymm0
+	vpclmulqdq	$0x01,%ymm11,%ymm10,%ymm12
+	vpxord	%ymm3,%ymm0,%ymm5
+	vpshufd	$0x4e,%ymm11,%ymm11
+	vpternlogd	$0x96,%ymm12,%ymm11,%ymm5
+	vextracti32x4	$1,%ymm5,%xmm0
+	vpxord	%xmm0,%xmm5,%xmm5
+
+	subq	$-128,%rdx
+	addq	$-128,%rcx
+	cmpq	$128-1,%rcx
+	ja	L$aad_loop_4x__func1
+
+
+	cmpq	$32,%rcx
+	jb	L$aad_large_done__func1
+L$aad_loop_1x__func1:
+	vmovdqu8	(%rdx),%ymm0
+	vpshufb	%ymm4,%ymm0,%ymm0
+	vpxord	%ymm0,%ymm5,%ymm5
+	vpclmulqdq	$0x00,%ymm9,%ymm5,%ymm0
+	vpclmulqdq	$0x01,%ymm9,%ymm5,%ymm1
+	vpclmulqdq	$0x10,%ymm9,%ymm5,%ymm2
+	vpxord	%ymm2,%ymm1,%ymm1
+	vpclmulqdq	$0x01,%ymm0,%ymm10,%ymm2
+	vpshufd	$0x4e,%ymm0,%ymm0
+	vpternlogd	$0x96,%ymm2,%ymm0,%ymm1
+	vpclmulqdq	$0x11,%ymm9,%ymm5,%ymm5
+	vpclmulqdq	$0x01,%ymm1,%ymm10,%ymm0
+	vpshufd	$0x4e,%ymm1,%ymm1
+	vpternlogd	$0x96,%ymm0,%ymm1,%ymm5
+
+	vextracti32x4	$1,%ymm5,%xmm0
+	vpxord	%xmm0,%xmm5,%xmm5
+
+	addq	$32,%rdx
+	subq	$32,%rcx
+	cmpq	$32,%rcx
+	jae	L$aad_loop_1x__func1
+
+L$aad_large_done__func1:
+
+
+	vzeroupper
+
+
+L$aad_blockbyblock__func1:
+	testq	%rcx,%rcx
+	jz	L$aad_done__func1
+	vmovdqu	256-16(%rsi),%xmm9
+L$aad_loop_blockbyblock__func1:
+	vmovdqu	(%rdx),%xmm0
+	vpshufb	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm9,%xmm5,%xmm0
+	vpclmulqdq	$0x01,%xmm9,%xmm5,%xmm1
+	vpclmulqdq	$0x10,%xmm9,%xmm5,%xmm2
+	vpxord	%xmm2,%xmm1,%xmm1
+	vpclmulqdq	$0x01,%xmm0,%xmm10,%xmm2
+	vpshufd	$0x4e,%xmm0,%xmm0
+	vpternlogd	$0x96,%xmm2,%xmm0,%xmm1
+	vpclmulqdq	$0x11,%xmm9,%xmm5,%xmm5
+	vpclmulqdq	$0x01,%xmm1,%xmm10,%xmm0
+	vpshufd	$0x4e,%xmm1,%xmm1
+	vpternlogd	$0x96,%xmm0,%xmm1,%xmm5
+
+	addq	$16,%rdx
+	subq	$16,%rcx
+	jnz	L$aad_loop_blockbyblock__func1
+
+L$aad_done__func1:
+
+	vpshufb	%xmm4,%xmm5,%xmm5
+	vmovdqu	%xmm5,(%rdi)
+	ret
+
+
+
+.globl	_aes_gcm_enc_update_vaes_avx10_256
+.private_extern _aes_gcm_enc_update_vaes_avx10_256
+
+.p2align	5
+_aes_gcm_enc_update_vaes_avx10_256:
+
+
+_CET_ENDBR
+	pushq	%r12
+
+
+	movq	16(%rsp),%r12
+#ifdef BORINGSSL_DISPATCH_TEST
+
+	movb	$1,_BORINGSSL_function_hit+6(%rip)
+#endif
+
+	vbroadcasti32x4	L$bswap_mask(%rip),%ymm8
+	vbroadcasti32x4	L$gfpoly(%rip),%ymm31
+
+
+
+	vmovdqu	(%r12),%xmm10
+	vpshufb	%xmm8,%xmm10,%xmm10
+	vbroadcasti32x4	(%r8),%ymm12
+	vpshufb	%ymm8,%ymm12,%ymm12
+
+
+
+	movl	240(%rcx),%r10d
+	leal	-20(,%r10,4),%r10d
+
+
+
+
+	leaq	96(%rcx,%r10,4),%r11
+	vbroadcasti32x4	(%rcx),%ymm13
+	vbroadcasti32x4	(%r11),%ymm14
+
+
+	vpaddd	L$ctr_pattern(%rip),%ymm12,%ymm12
+
+
+	vbroadcasti32x4	L$inc_2blocks(%rip),%ymm11
+
+
+
+	cmpq	$128-1,%rdx
+	jbe	L$crypt_loop_4x_done__func1
+
+
+	vmovdqu8	256-128(%r9),%ymm27
+	vmovdqu8	256-96(%r9),%ymm28
+	vmovdqu8	256-64(%r9),%ymm29
+	vmovdqu8	256-32(%r9),%ymm30
+
+
+
+
+	vpshufb	%ymm8,%ymm12,%ymm0
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm12,%ymm1
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm12,%ymm2
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm12,%ymm3
+	vpaddd	%ymm11,%ymm12,%ymm12
+
+
+	vpxord	%ymm13,%ymm0,%ymm0
+	vpxord	%ymm13,%ymm1,%ymm1
+	vpxord	%ymm13,%ymm2,%ymm2
+	vpxord	%ymm13,%ymm3,%ymm3
+
+	leaq	16(%rcx),%rax
+L$vaesenc_loop_first_4_vecs__func1:
+	vbroadcasti32x4	(%rax),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	vaesenc	%ymm9,%ymm1,%ymm1
+	vaesenc	%ymm9,%ymm2,%ymm2
+	vaesenc	%ymm9,%ymm3,%ymm3
+
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	L$vaesenc_loop_first_4_vecs__func1
+
+
+
+	vpxord	0(%rdi),%ymm14,%ymm4
+	vpxord	32(%rdi),%ymm14,%ymm5
+	vpxord	64(%rdi),%ymm14,%ymm6
+	vpxord	96(%rdi),%ymm14,%ymm7
+
+
+
+	vaesenclast	%ymm4,%ymm0,%ymm4
+	vaesenclast	%ymm5,%ymm1,%ymm5
+	vaesenclast	%ymm6,%ymm2,%ymm6
+	vaesenclast	%ymm7,%ymm3,%ymm7
+
+
+	vmovdqu8	%ymm4,0(%rsi)
+	vmovdqu8	%ymm5,32(%rsi)
+	vmovdqu8	%ymm6,64(%rsi)
+	vmovdqu8	%ymm7,96(%rsi)
+
+	subq	$-128,%rdi
+	subq	$-128,%rsi
+	addq	$-128,%rdx
+	cmpq	$128-1,%rdx
+	jbe	L$ghash_last_ciphertext_4x__func1
+	vbroadcasti32x4	-144(%r11),%ymm15
+	vbroadcasti32x4	-128(%r11),%ymm16
+	vbroadcasti32x4	-112(%r11),%ymm17
+	vbroadcasti32x4	-96(%r11),%ymm18
+	vbroadcasti32x4	-80(%r11),%ymm19
+	vbroadcasti32x4	-64(%r11),%ymm20
+	vbroadcasti32x4	-48(%r11),%ymm21
+	vbroadcasti32x4	-32(%r11),%ymm22
+	vbroadcasti32x4	-16(%r11),%ymm23
+L$crypt_loop_4x__func1:
+
+
+
+	vpshufb	%ymm8,%ymm12,%ymm0
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm12,%ymm1
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm12,%ymm2
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm12,%ymm3
+	vpaddd	%ymm11,%ymm12,%ymm12
+
+
+	vpxord	%ymm13,%ymm0,%ymm0
+	vpxord	%ymm13,%ymm1,%ymm1
+	vpxord	%ymm13,%ymm2,%ymm2
+	vpxord	%ymm13,%ymm3,%ymm3
+
+	cmpl	$24,%r10d
+	jl	L$aes128__func1
+	je	L$aes192__func1
+
+	vbroadcasti32x4	-208(%r11),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	vaesenc	%ymm9,%ymm1,%ymm1
+	vaesenc	%ymm9,%ymm2,%ymm2
+	vaesenc	%ymm9,%ymm3,%ymm3
+
+	vbroadcasti32x4	-192(%r11),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	vaesenc	%ymm9,%ymm1,%ymm1
+	vaesenc	%ymm9,%ymm2,%ymm2
+	vaesenc	%ymm9,%ymm3,%ymm3
+
+L$aes192__func1:
+	vbroadcasti32x4	-176(%r11),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	vaesenc	%ymm9,%ymm1,%ymm1
+	vaesenc	%ymm9,%ymm2,%ymm2
+	vaesenc	%ymm9,%ymm3,%ymm3
+
+	vbroadcasti32x4	-160(%r11),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	vaesenc	%ymm9,%ymm1,%ymm1
+	vaesenc	%ymm9,%ymm2,%ymm2
+	vaesenc	%ymm9,%ymm3,%ymm3
+
+L$aes128__func1:
+	vpshufb	%ymm8,%ymm4,%ymm4
+	vpxord	%ymm10,%ymm4,%ymm4
+	vpshufb	%ymm8,%ymm5,%ymm5
+	vpshufb	%ymm8,%ymm6,%ymm6
+
+	vaesenc	%ymm15,%ymm0,%ymm0
+	vaesenc	%ymm15,%ymm1,%ymm1
+	vaesenc	%ymm15,%ymm2,%ymm2
+	vaesenc	%ymm15,%ymm3,%ymm3
+
+	vpshufb	%ymm8,%ymm7,%ymm7
+	vpclmulqdq	$0x00,%ymm27,%ymm4,%ymm10
+	vpclmulqdq	$0x00,%ymm28,%ymm5,%ymm24
+	vpclmulqdq	$0x00,%ymm29,%ymm6,%ymm25
+
+	vaesenc	%ymm16,%ymm0,%ymm0
+	vaesenc	%ymm16,%ymm1,%ymm1
+	vaesenc	%ymm16,%ymm2,%ymm2
+	vaesenc	%ymm16,%ymm3,%ymm3
+
+	vpxord	%ymm24,%ymm10,%ymm10
+	vpclmulqdq	$0x00,%ymm30,%ymm7,%ymm26
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm10
+	vpclmulqdq	$0x01,%ymm27,%ymm4,%ymm24
+
+	vaesenc	%ymm17,%ymm0,%ymm0
+	vaesenc	%ymm17,%ymm1,%ymm1
+	vaesenc	%ymm17,%ymm2,%ymm2
+	vaesenc	%ymm17,%ymm3,%ymm3
+
+	vpclmulqdq	$0x01,%ymm28,%ymm5,%ymm25
+	vpclmulqdq	$0x01,%ymm29,%ymm6,%ymm26
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm24
+	vpclmulqdq	$0x01,%ymm30,%ymm7,%ymm25
+
+	vaesenc	%ymm18,%ymm0,%ymm0
+	vaesenc	%ymm18,%ymm1,%ymm1
+	vaesenc	%ymm18,%ymm2,%ymm2
+	vaesenc	%ymm18,%ymm3,%ymm3
+
+	vpclmulqdq	$0x10,%ymm27,%ymm4,%ymm26
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm24
+	vpclmulqdq	$0x10,%ymm28,%ymm5,%ymm25
+	vpclmulqdq	$0x10,%ymm29,%ymm6,%ymm26
+
+	vaesenc	%ymm19,%ymm0,%ymm0
+	vaesenc	%ymm19,%ymm1,%ymm1
+	vaesenc	%ymm19,%ymm2,%ymm2
+	vaesenc	%ymm19,%ymm3,%ymm3
+
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm24
+	vpclmulqdq	$0x01,%ymm10,%ymm31,%ymm26
+	vpclmulqdq	$0x10,%ymm30,%ymm7,%ymm25
+	vpxord	%ymm25,%ymm24,%ymm24
+
+	vaesenc	%ymm20,%ymm0,%ymm0
+	vaesenc	%ymm20,%ymm1,%ymm1
+	vaesenc	%ymm20,%ymm2,%ymm2
+	vaesenc	%ymm20,%ymm3,%ymm3
+
+	vpshufd	$0x4e,%ymm10,%ymm10
+	vpclmulqdq	$0x11,%ymm27,%ymm4,%ymm4
+	vpclmulqdq	$0x11,%ymm28,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm29,%ymm6,%ymm6
+
+	vaesenc	%ymm21,%ymm0,%ymm0
+	vaesenc	%ymm21,%ymm1,%ymm1
+	vaesenc	%ymm21,%ymm2,%ymm2
+	vaesenc	%ymm21,%ymm3,%ymm3
+
+	vpternlogd	$0x96,%ymm26,%ymm10,%ymm24
+	vpclmulqdq	$0x11,%ymm30,%ymm7,%ymm7
+	vpternlogd	$0x96,%ymm6,%ymm5,%ymm4
+	vpclmulqdq	$0x01,%ymm24,%ymm31,%ymm25
+
+	vaesenc	%ymm22,%ymm0,%ymm0
+	vaesenc	%ymm22,%ymm1,%ymm1
+	vaesenc	%ymm22,%ymm2,%ymm2
+	vaesenc	%ymm22,%ymm3,%ymm3
+
+	vpxord	%ymm7,%ymm4,%ymm10
+	vpshufd	$0x4e,%ymm24,%ymm24
+	vpternlogd	$0x96,%ymm25,%ymm24,%ymm10
+
+	vaesenc	%ymm23,%ymm0,%ymm0
+	vaesenc	%ymm23,%ymm1,%ymm1
+	vaesenc	%ymm23,%ymm2,%ymm2
+	vaesenc	%ymm23,%ymm3,%ymm3
+
+	vextracti32x4	$1,%ymm10,%xmm4
+	vpxord	%xmm4,%xmm10,%xmm10
+
+
+
+
+	vpxord	0(%rdi),%ymm14,%ymm4
+	vpxord	32(%rdi),%ymm14,%ymm5
+	vpxord	64(%rdi),%ymm14,%ymm6
+	vpxord	96(%rdi),%ymm14,%ymm7
+
+
+
+	vaesenclast	%ymm4,%ymm0,%ymm4
+	vaesenclast	%ymm5,%ymm1,%ymm5
+	vaesenclast	%ymm6,%ymm2,%ymm6
+	vaesenclast	%ymm7,%ymm3,%ymm7
+
+
+	vmovdqu8	%ymm4,0(%rsi)
+	vmovdqu8	%ymm5,32(%rsi)
+	vmovdqu8	%ymm6,64(%rsi)
+	vmovdqu8	%ymm7,96(%rsi)
+
+	subq	$-128,%rdi
+	subq	$-128,%rsi
+	addq	$-128,%rdx
+	cmpq	$128-1,%rdx
+	ja	L$crypt_loop_4x__func1
+L$ghash_last_ciphertext_4x__func1:
+	vpshufb	%ymm8,%ymm4,%ymm4
+	vpxord	%ymm10,%ymm4,%ymm4
+	vpshufb	%ymm8,%ymm5,%ymm5
+	vpshufb	%ymm8,%ymm6,%ymm6
+	vpshufb	%ymm8,%ymm7,%ymm7
+	vpclmulqdq	$0x00,%ymm27,%ymm4,%ymm10
+	vpclmulqdq	$0x00,%ymm28,%ymm5,%ymm24
+	vpclmulqdq	$0x00,%ymm29,%ymm6,%ymm25
+	vpxord	%ymm24,%ymm10,%ymm10
+	vpclmulqdq	$0x00,%ymm30,%ymm7,%ymm26
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm10
+	vpclmulqdq	$0x01,%ymm27,%ymm4,%ymm24
+	vpclmulqdq	$0x01,%ymm28,%ymm5,%ymm25
+	vpclmulqdq	$0x01,%ymm29,%ymm6,%ymm26
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm24
+	vpclmulqdq	$0x01,%ymm30,%ymm7,%ymm25
+	vpclmulqdq	$0x10,%ymm27,%ymm4,%ymm26
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm24
+	vpclmulqdq	$0x10,%ymm28,%ymm5,%ymm25
+	vpclmulqdq	$0x10,%ymm29,%ymm6,%ymm26
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm24
+	vpclmulqdq	$0x01,%ymm10,%ymm31,%ymm26
+	vpclmulqdq	$0x10,%ymm30,%ymm7,%ymm25
+	vpxord	%ymm25,%ymm24,%ymm24
+	vpshufd	$0x4e,%ymm10,%ymm10
+	vpclmulqdq	$0x11,%ymm27,%ymm4,%ymm4
+	vpclmulqdq	$0x11,%ymm28,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm29,%ymm6,%ymm6
+	vpternlogd	$0x96,%ymm26,%ymm10,%ymm24
+	vpclmulqdq	$0x11,%ymm30,%ymm7,%ymm7
+	vpternlogd	$0x96,%ymm6,%ymm5,%ymm4
+	vpclmulqdq	$0x01,%ymm24,%ymm31,%ymm25
+	vpxord	%ymm7,%ymm4,%ymm10
+	vpshufd	$0x4e,%ymm24,%ymm24
+	vpternlogd	$0x96,%ymm25,%ymm24,%ymm10
+	vextracti32x4	$1,%ymm10,%xmm4
+	vpxord	%xmm4,%xmm10,%xmm10
+
+L$crypt_loop_4x_done__func1:
+
+	testq	%rdx,%rdx
+	jz	L$done__func1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	movq	%rdx,%rax
+	negq	%rax
+	andq	$-16,%rax
+	leaq	256(%r9,%rax,1),%r8
+	vpxor	%xmm4,%xmm4,%xmm4
+	vpxor	%xmm5,%xmm5,%xmm5
+	vpxor	%xmm6,%xmm6,%xmm6
+
+	cmpq	$32,%rdx
+	jb	L$partial_vec__func1
+
+L$crypt_loop_1x__func1:
+
+
+
+	vpshufb	%ymm8,%ymm12,%ymm0
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpxord	%ymm13,%ymm0,%ymm0
+	leaq	16(%rcx),%rax
+L$vaesenc_loop_tail_full_vec__func1:
+	vbroadcasti32x4	(%rax),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	L$vaesenc_loop_tail_full_vec__func1
+	vaesenclast	%ymm14,%ymm0,%ymm0
+
+
+	vmovdqu8	(%rdi),%ymm1
+	vpxord	%ymm1,%ymm0,%ymm0
+	vmovdqu8	%ymm0,(%rsi)
+
+
+	vmovdqu8	(%r8),%ymm30
+	vpshufb	%ymm8,%ymm0,%ymm0
+	vpxord	%ymm10,%ymm0,%ymm0
+	vpclmulqdq	$0x00,%ymm30,%ymm0,%ymm7
+	vpclmulqdq	$0x01,%ymm30,%ymm0,%ymm1
+	vpclmulqdq	$0x10,%ymm30,%ymm0,%ymm2
+	vpclmulqdq	$0x11,%ymm30,%ymm0,%ymm3
+	vpxord	%ymm7,%ymm4,%ymm4
+	vpternlogd	$0x96,%ymm2,%ymm1,%ymm5
+	vpxord	%ymm3,%ymm6,%ymm6
+
+	vpxor	%xmm10,%xmm10,%xmm10
+
+	addq	$32,%r8
+	addq	$32,%rdi
+	addq	$32,%rsi
+	subq	$32,%rdx
+	cmpq	$32,%rdx
+	jae	L$crypt_loop_1x__func1
+
+	testq	%rdx,%rdx
+	jz	L$reduce__func1
+
+L$partial_vec__func1:
+
+
+
+
+	movq	$-1,%rax
+	bzhiq	%rdx,%rax,%rax
+	kmovd	%eax,%k1
+	addq	$15,%rdx
+	andq	$-16,%rdx
+	movq	$-1,%rax
+	bzhiq	%rdx,%rax,%rax
+	kmovd	%eax,%k2
+
+
+
+	vpshufb	%ymm8,%ymm12,%ymm0
+	vpxord	%ymm13,%ymm0,%ymm0
+	leaq	16(%rcx),%rax
+L$vaesenc_loop_tail_partialvec__func1:
+	vbroadcasti32x4	(%rax),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	L$vaesenc_loop_tail_partialvec__func1
+	vaesenclast	%ymm14,%ymm0,%ymm0
+
+
+	vmovdqu8	(%rdi),%ymm1{%k1}{z}
+	vpxord	%ymm1,%ymm0,%ymm0
+	vmovdqu8	%ymm0,(%rsi){%k1}
+
+
+
+
+
+
+
+
+
+
+
+
+
+	vmovdqu8	(%r8),%ymm30{%k2}{z}
+	vmovdqu8	%ymm0,%ymm1{%k1}{z}
+	vpshufb	%ymm8,%ymm1,%ymm0
+	vpxord	%ymm10,%ymm0,%ymm0
+	vpclmulqdq	$0x00,%ymm30,%ymm0,%ymm7
+	vpclmulqdq	$0x01,%ymm30,%ymm0,%ymm1
+	vpclmulqdq	$0x10,%ymm30,%ymm0,%ymm2
+	vpclmulqdq	$0x11,%ymm30,%ymm0,%ymm3
+	vpxord	%ymm7,%ymm4,%ymm4
+	vpternlogd	$0x96,%ymm2,%ymm1,%ymm5
+	vpxord	%ymm3,%ymm6,%ymm6
+
+
+L$reduce__func1:
+
+	vpclmulqdq	$0x01,%ymm4,%ymm31,%ymm0
+	vpshufd	$0x4e,%ymm4,%ymm4
+	vpternlogd	$0x96,%ymm0,%ymm4,%ymm5
+	vpclmulqdq	$0x01,%ymm5,%ymm31,%ymm0
+	vpshufd	$0x4e,%ymm5,%ymm5
+	vpternlogd	$0x96,%ymm0,%ymm5,%ymm6
+
+	vextracti32x4	$1,%ymm6,%xmm0
+	vpxord	%xmm0,%xmm6,%xmm10
+
+
+L$done__func1:
+
+	vpshufb	%xmm8,%xmm10,%xmm10
+	vmovdqu	%xmm10,(%r12)
+
+	vzeroupper
+	popq	%r12
+
+	ret
+
+
+
+.globl	_aes_gcm_dec_update_vaes_avx10_256
+.private_extern _aes_gcm_dec_update_vaes_avx10_256
+
+.p2align	5
+_aes_gcm_dec_update_vaes_avx10_256:
+
+
+_CET_ENDBR
+	pushq	%r12
+
+
+	movq	16(%rsp),%r12
+
+	vbroadcasti32x4	L$bswap_mask(%rip),%ymm8
+	vbroadcasti32x4	L$gfpoly(%rip),%ymm31
+
+
+
+	vmovdqu	(%r12),%xmm10
+	vpshufb	%xmm8,%xmm10,%xmm10
+	vbroadcasti32x4	(%r8),%ymm12
+	vpshufb	%ymm8,%ymm12,%ymm12
+
+
+
+	movl	240(%rcx),%r10d
+	leal	-20(,%r10,4),%r10d
+
+
+
+
+	leaq	96(%rcx,%r10,4),%r11
+	vbroadcasti32x4	(%rcx),%ymm13
+	vbroadcasti32x4	(%r11),%ymm14
+
+
+	vpaddd	L$ctr_pattern(%rip),%ymm12,%ymm12
+
+
+	vbroadcasti32x4	L$inc_2blocks(%rip),%ymm11
+
+
+
+	cmpq	$128-1,%rdx
+	jbe	L$crypt_loop_4x_done__func2
+
+
+	vmovdqu8	256-128(%r9),%ymm27
+	vmovdqu8	256-96(%r9),%ymm28
+	vmovdqu8	256-64(%r9),%ymm29
+	vmovdqu8	256-32(%r9),%ymm30
+	vbroadcasti32x4	-144(%r11),%ymm15
+	vbroadcasti32x4	-128(%r11),%ymm16
+	vbroadcasti32x4	-112(%r11),%ymm17
+	vbroadcasti32x4	-96(%r11),%ymm18
+	vbroadcasti32x4	-80(%r11),%ymm19
+	vbroadcasti32x4	-64(%r11),%ymm20
+	vbroadcasti32x4	-48(%r11),%ymm21
+	vbroadcasti32x4	-32(%r11),%ymm22
+	vbroadcasti32x4	-16(%r11),%ymm23
+L$crypt_loop_4x__func2:
+	vmovdqu8	0(%rdi),%ymm4
+	vmovdqu8	32(%rdi),%ymm5
+	vmovdqu8	64(%rdi),%ymm6
+	vmovdqu8	96(%rdi),%ymm7
+
+
+
+	vpshufb	%ymm8,%ymm12,%ymm0
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm12,%ymm1
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm12,%ymm2
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm12,%ymm3
+	vpaddd	%ymm11,%ymm12,%ymm12
+
+
+	vpxord	%ymm13,%ymm0,%ymm0
+	vpxord	%ymm13,%ymm1,%ymm1
+	vpxord	%ymm13,%ymm2,%ymm2
+	vpxord	%ymm13,%ymm3,%ymm3
+
+	cmpl	$24,%r10d
+	jl	L$aes128__func2
+	je	L$aes192__func2
+
+	vbroadcasti32x4	-208(%r11),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	vaesenc	%ymm9,%ymm1,%ymm1
+	vaesenc	%ymm9,%ymm2,%ymm2
+	vaesenc	%ymm9,%ymm3,%ymm3
+
+	vbroadcasti32x4	-192(%r11),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	vaesenc	%ymm9,%ymm1,%ymm1
+	vaesenc	%ymm9,%ymm2,%ymm2
+	vaesenc	%ymm9,%ymm3,%ymm3
+
+L$aes192__func2:
+	vbroadcasti32x4	-176(%r11),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	vaesenc	%ymm9,%ymm1,%ymm1
+	vaesenc	%ymm9,%ymm2,%ymm2
+	vaesenc	%ymm9,%ymm3,%ymm3
+
+	vbroadcasti32x4	-160(%r11),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	vaesenc	%ymm9,%ymm1,%ymm1
+	vaesenc	%ymm9,%ymm2,%ymm2
+	vaesenc	%ymm9,%ymm3,%ymm3
+
+L$aes128__func2:
+	vpshufb	%ymm8,%ymm4,%ymm4
+	vpxord	%ymm10,%ymm4,%ymm4
+	vpshufb	%ymm8,%ymm5,%ymm5
+	vpshufb	%ymm8,%ymm6,%ymm6
+
+	vaesenc	%ymm15,%ymm0,%ymm0
+	vaesenc	%ymm15,%ymm1,%ymm1
+	vaesenc	%ymm15,%ymm2,%ymm2
+	vaesenc	%ymm15,%ymm3,%ymm3
+
+	vpshufb	%ymm8,%ymm7,%ymm7
+	vpclmulqdq	$0x00,%ymm27,%ymm4,%ymm10
+	vpclmulqdq	$0x00,%ymm28,%ymm5,%ymm24
+	vpclmulqdq	$0x00,%ymm29,%ymm6,%ymm25
+
+	vaesenc	%ymm16,%ymm0,%ymm0
+	vaesenc	%ymm16,%ymm1,%ymm1
+	vaesenc	%ymm16,%ymm2,%ymm2
+	vaesenc	%ymm16,%ymm3,%ymm3
+
+	vpxord	%ymm24,%ymm10,%ymm10
+	vpclmulqdq	$0x00,%ymm30,%ymm7,%ymm26
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm10
+	vpclmulqdq	$0x01,%ymm27,%ymm4,%ymm24
+
+	vaesenc	%ymm17,%ymm0,%ymm0
+	vaesenc	%ymm17,%ymm1,%ymm1
+	vaesenc	%ymm17,%ymm2,%ymm2
+	vaesenc	%ymm17,%ymm3,%ymm3
+
+	vpclmulqdq	$0x01,%ymm28,%ymm5,%ymm25
+	vpclmulqdq	$0x01,%ymm29,%ymm6,%ymm26
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm24
+	vpclmulqdq	$0x01,%ymm30,%ymm7,%ymm25
+
+	vaesenc	%ymm18,%ymm0,%ymm0
+	vaesenc	%ymm18,%ymm1,%ymm1
+	vaesenc	%ymm18,%ymm2,%ymm2
+	vaesenc	%ymm18,%ymm3,%ymm3
+
+	vpclmulqdq	$0x10,%ymm27,%ymm4,%ymm26
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm24
+	vpclmulqdq	$0x10,%ymm28,%ymm5,%ymm25
+	vpclmulqdq	$0x10,%ymm29,%ymm6,%ymm26
+
+	vaesenc	%ymm19,%ymm0,%ymm0
+	vaesenc	%ymm19,%ymm1,%ymm1
+	vaesenc	%ymm19,%ymm2,%ymm2
+	vaesenc	%ymm19,%ymm3,%ymm3
+
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm24
+	vpclmulqdq	$0x01,%ymm10,%ymm31,%ymm26
+	vpclmulqdq	$0x10,%ymm30,%ymm7,%ymm25
+	vpxord	%ymm25,%ymm24,%ymm24
+
+	vaesenc	%ymm20,%ymm0,%ymm0
+	vaesenc	%ymm20,%ymm1,%ymm1
+	vaesenc	%ymm20,%ymm2,%ymm2
+	vaesenc	%ymm20,%ymm3,%ymm3
+
+	vpshufd	$0x4e,%ymm10,%ymm10
+	vpclmulqdq	$0x11,%ymm27,%ymm4,%ymm4
+	vpclmulqdq	$0x11,%ymm28,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm29,%ymm6,%ymm6
+
+	vaesenc	%ymm21,%ymm0,%ymm0
+	vaesenc	%ymm21,%ymm1,%ymm1
+	vaesenc	%ymm21,%ymm2,%ymm2
+	vaesenc	%ymm21,%ymm3,%ymm3
+
+	vpternlogd	$0x96,%ymm26,%ymm10,%ymm24
+	vpclmulqdq	$0x11,%ymm30,%ymm7,%ymm7
+	vpternlogd	$0x96,%ymm6,%ymm5,%ymm4
+	vpclmulqdq	$0x01,%ymm24,%ymm31,%ymm25
+
+	vaesenc	%ymm22,%ymm0,%ymm0
+	vaesenc	%ymm22,%ymm1,%ymm1
+	vaesenc	%ymm22,%ymm2,%ymm2
+	vaesenc	%ymm22,%ymm3,%ymm3
+
+	vpxord	%ymm7,%ymm4,%ymm10
+	vpshufd	$0x4e,%ymm24,%ymm24
+	vpternlogd	$0x96,%ymm25,%ymm24,%ymm10
+
+	vaesenc	%ymm23,%ymm0,%ymm0
+	vaesenc	%ymm23,%ymm1,%ymm1
+	vaesenc	%ymm23,%ymm2,%ymm2
+	vaesenc	%ymm23,%ymm3,%ymm3
+
+	vextracti32x4	$1,%ymm10,%xmm4
+	vpxord	%xmm4,%xmm10,%xmm10
+
+
+
+
+	vpxord	0(%rdi),%ymm14,%ymm4
+	vpxord	32(%rdi),%ymm14,%ymm5
+	vpxord	64(%rdi),%ymm14,%ymm6
+	vpxord	96(%rdi),%ymm14,%ymm7
+
+
+
+	vaesenclast	%ymm4,%ymm0,%ymm4
+	vaesenclast	%ymm5,%ymm1,%ymm5
+	vaesenclast	%ymm6,%ymm2,%ymm6
+	vaesenclast	%ymm7,%ymm3,%ymm7
+
+
+	vmovdqu8	%ymm4,0(%rsi)
+	vmovdqu8	%ymm5,32(%rsi)
+	vmovdqu8	%ymm6,64(%rsi)
+	vmovdqu8	%ymm7,96(%rsi)
+
+	subq	$-128,%rdi
+	subq	$-128,%rsi
+	addq	$-128,%rdx
+	cmpq	$128-1,%rdx
+	ja	L$crypt_loop_4x__func2
+L$crypt_loop_4x_done__func2:
+
+	testq	%rdx,%rdx
+	jz	L$done__func2
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	movq	%rdx,%rax
+	negq	%rax
+	andq	$-16,%rax
+	leaq	256(%r9,%rax,1),%r8
+	vpxor	%xmm4,%xmm4,%xmm4
+	vpxor	%xmm5,%xmm5,%xmm5
+	vpxor	%xmm6,%xmm6,%xmm6
+
+	cmpq	$32,%rdx
+	jb	L$partial_vec__func2
+
+L$crypt_loop_1x__func2:
+
+
+
+	vpshufb	%ymm8,%ymm12,%ymm0
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpxord	%ymm13,%ymm0,%ymm0
+	leaq	16(%rcx),%rax
+L$vaesenc_loop_tail_full_vec__func2:
+	vbroadcasti32x4	(%rax),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	L$vaesenc_loop_tail_full_vec__func2
+	vaesenclast	%ymm14,%ymm0,%ymm0
+
+
+	vmovdqu8	(%rdi),%ymm1
+	vpxord	%ymm1,%ymm0,%ymm0
+	vmovdqu8	%ymm0,(%rsi)
+
+
+	vmovdqu8	(%r8),%ymm30
+	vpshufb	%ymm8,%ymm1,%ymm0
+	vpxord	%ymm10,%ymm0,%ymm0
+	vpclmulqdq	$0x00,%ymm30,%ymm0,%ymm7
+	vpclmulqdq	$0x01,%ymm30,%ymm0,%ymm1
+	vpclmulqdq	$0x10,%ymm30,%ymm0,%ymm2
+	vpclmulqdq	$0x11,%ymm30,%ymm0,%ymm3
+	vpxord	%ymm7,%ymm4,%ymm4
+	vpternlogd	$0x96,%ymm2,%ymm1,%ymm5
+	vpxord	%ymm3,%ymm6,%ymm6
+
+	vpxor	%xmm10,%xmm10,%xmm10
+
+	addq	$32,%r8
+	addq	$32,%rdi
+	addq	$32,%rsi
+	subq	$32,%rdx
+	cmpq	$32,%rdx
+	jae	L$crypt_loop_1x__func2
+
+	testq	%rdx,%rdx
+	jz	L$reduce__func2
+
+L$partial_vec__func2:
+
+
+
+
+	movq	$-1,%rax
+	bzhiq	%rdx,%rax,%rax
+	kmovd	%eax,%k1
+	addq	$15,%rdx
+	andq	$-16,%rdx
+	movq	$-1,%rax
+	bzhiq	%rdx,%rax,%rax
+	kmovd	%eax,%k2
+
+
+
+	vpshufb	%ymm8,%ymm12,%ymm0
+	vpxord	%ymm13,%ymm0,%ymm0
+	leaq	16(%rcx),%rax
+L$vaesenc_loop_tail_partialvec__func2:
+	vbroadcasti32x4	(%rax),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	L$vaesenc_loop_tail_partialvec__func2
+	vaesenclast	%ymm14,%ymm0,%ymm0
+
+
+	vmovdqu8	(%rdi),%ymm1{%k1}{z}
+	vpxord	%ymm1,%ymm0,%ymm0
+	vmovdqu8	%ymm0,(%rsi){%k1}
+
+
+
+
+
+
+
+
+
+
+
+
+
+	vmovdqu8	(%r8),%ymm30{%k2}{z}
+
+	vpshufb	%ymm8,%ymm1,%ymm0
+	vpxord	%ymm10,%ymm0,%ymm0
+	vpclmulqdq	$0x00,%ymm30,%ymm0,%ymm7
+	vpclmulqdq	$0x01,%ymm30,%ymm0,%ymm1
+	vpclmulqdq	$0x10,%ymm30,%ymm0,%ymm2
+	vpclmulqdq	$0x11,%ymm30,%ymm0,%ymm3
+	vpxord	%ymm7,%ymm4,%ymm4
+	vpternlogd	$0x96,%ymm2,%ymm1,%ymm5
+	vpxord	%ymm3,%ymm6,%ymm6
+
+
+L$reduce__func2:
+
+	vpclmulqdq	$0x01,%ymm4,%ymm31,%ymm0
+	vpshufd	$0x4e,%ymm4,%ymm4
+	vpternlogd	$0x96,%ymm0,%ymm4,%ymm5
+	vpclmulqdq	$0x01,%ymm5,%ymm31,%ymm0
+	vpshufd	$0x4e,%ymm5,%ymm5
+	vpternlogd	$0x96,%ymm0,%ymm5,%ymm6
+
+	vextracti32x4	$1,%ymm6,%xmm0
+	vpxord	%xmm0,%xmm6,%xmm10
+
+
+L$done__func2:
+
+	vpshufb	%xmm8,%xmm10,%xmm10
+	vmovdqu	%xmm10,(%r12)
+
+	vzeroupper
+	popq	%r12
+
+	ret
+
+
+
+.globl	_gcm_ghash_vpclmulqdq_avx10_512
+.private_extern _gcm_ghash_vpclmulqdq_avx10_512
+
+.p2align	5
+_gcm_ghash_vpclmulqdq_avx10_512:
+
+
+_CET_ENDBR
+
+
+
+
+
+
+	vmovdqu	L$bswap_mask(%rip),%xmm4
+	vmovdqu	L$gfpoly(%rip),%xmm10
+
+
+	vmovdqu	(%rdi),%xmm5
+	vpshufb	%xmm4,%xmm5,%xmm5
+
+
+	cmpq	$64,%rcx
+	jb	L$aad_blockbyblock__func2
+
+
+
+	vshufi64x2	$0,%zmm4,%zmm4,%zmm4
+	vshufi64x2	$0,%zmm10,%zmm10,%zmm10
+
+
+	vmovdqu8	256-64(%rsi),%zmm9
+
+	cmpq	$256-1,%rcx
+	jbe	L$aad_loop_1x__func2
+
+
+	vmovdqu8	256-256(%rsi),%zmm6
+	vmovdqu8	256-192(%rsi),%zmm7
+	vmovdqu8	256-128(%rsi),%zmm8
+
+
+L$aad_loop_4x__func2:
+	vmovdqu8	0(%rdx),%zmm0
+	vmovdqu8	64(%rdx),%zmm1
+	vmovdqu8	128(%rdx),%zmm2
+	vmovdqu8	192(%rdx),%zmm3
+	vpshufb	%zmm4,%zmm0,%zmm0
+	vpxord	%zmm5,%zmm0,%zmm0
+	vpshufb	%zmm4,%zmm1,%zmm1
+	vpshufb	%zmm4,%zmm2,%zmm2
+	vpshufb	%zmm4,%zmm3,%zmm3
+	vpclmulqdq	$0x00,%zmm6,%zmm0,%zmm5
+	vpclmulqdq	$0x00,%zmm7,%zmm1,%zmm11
+	vpclmulqdq	$0x00,%zmm8,%zmm2,%zmm12
+	vpxord	%zmm11,%zmm5,%zmm5
+	vpclmulqdq	$0x00,%zmm9,%zmm3,%zmm13
+	vpternlogd	$0x96,%zmm13,%zmm12,%zmm5
+	vpclmulqdq	$0x01,%zmm6,%zmm0,%zmm11
+	vpclmulqdq	$0x01,%zmm7,%zmm1,%zmm12
+	vpclmulqdq	$0x01,%zmm8,%zmm2,%zmm13
+	vpternlogd	$0x96,%zmm13,%zmm12,%zmm11
+	vpclmulqdq	$0x01,%zmm9,%zmm3,%zmm12
+	vpclmulqdq	$0x10,%zmm6,%zmm0,%zmm13
+	vpternlogd	$0x96,%zmm13,%zmm12,%zmm11
+	vpclmulqdq	$0x10,%zmm7,%zmm1,%zmm12
+	vpclmulqdq	$0x10,%zmm8,%zmm2,%zmm13
+	vpternlogd	$0x96,%zmm13,%zmm12,%zmm11
+	vpclmulqdq	$0x01,%zmm5,%zmm10,%zmm13
+	vpclmulqdq	$0x10,%zmm9,%zmm3,%zmm12
+	vpxord	%zmm12,%zmm11,%zmm11
+	vpshufd	$0x4e,%zmm5,%zmm5
+	vpclmulqdq	$0x11,%zmm6,%zmm0,%zmm0
+	vpclmulqdq	$0x11,%zmm7,%zmm1,%zmm1
+	vpclmulqdq	$0x11,%zmm8,%zmm2,%zmm2
+	vpternlogd	$0x96,%zmm13,%zmm5,%zmm11
+	vpclmulqdq	$0x11,%zmm9,%zmm3,%zmm3
+	vpternlogd	$0x96,%zmm2,%zmm1,%zmm0
+	vpclmulqdq	$0x01,%zmm11,%zmm10,%zmm12
+	vpxord	%zmm3,%zmm0,%zmm5
+	vpshufd	$0x4e,%zmm11,%zmm11
+	vpternlogd	$0x96,%zmm12,%zmm11,%zmm5
+	vextracti32x4	$1,%zmm5,%xmm0
+	vextracti32x4	$2,%zmm5,%xmm1
+	vextracti32x4	$3,%zmm5,%xmm2
+	vpxord	%xmm0,%xmm5,%xmm5
+	vpternlogd	$0x96,%xmm1,%xmm2,%xmm5
+
+	subq	$-256,%rdx
+	addq	$-256,%rcx
+	cmpq	$256-1,%rcx
+	ja	L$aad_loop_4x__func2
+
+
+	cmpq	$64,%rcx
+	jb	L$aad_large_done__func2
+L$aad_loop_1x__func2:
+	vmovdqu8	(%rdx),%zmm0
+	vpshufb	%zmm4,%zmm0,%zmm0
+	vpxord	%zmm0,%zmm5,%zmm5
+	vpclmulqdq	$0x00,%zmm9,%zmm5,%zmm0
+	vpclmulqdq	$0x01,%zmm9,%zmm5,%zmm1
+	vpclmulqdq	$0x10,%zmm9,%zmm5,%zmm2
+	vpxord	%zmm2,%zmm1,%zmm1
+	vpclmulqdq	$0x01,%zmm0,%zmm10,%zmm2
+	vpshufd	$0x4e,%zmm0,%zmm0
+	vpternlogd	$0x96,%zmm2,%zmm0,%zmm1
+	vpclmulqdq	$0x11,%zmm9,%zmm5,%zmm5
+	vpclmulqdq	$0x01,%zmm1,%zmm10,%zmm0
+	vpshufd	$0x4e,%zmm1,%zmm1
+	vpternlogd	$0x96,%zmm0,%zmm1,%zmm5
+
+	vextracti32x4	$1,%zmm5,%xmm0
+	vextracti32x4	$2,%zmm5,%xmm1
+	vextracti32x4	$3,%zmm5,%xmm2
+	vpxord	%xmm0,%xmm5,%xmm5
+	vpternlogd	$0x96,%xmm1,%xmm2,%xmm5
+
+	addq	$64,%rdx
+	subq	$64,%rcx
+	cmpq	$64,%rcx
+	jae	L$aad_loop_1x__func2
+
+L$aad_large_done__func2:
+
+
+	vzeroupper
+
+
+L$aad_blockbyblock__func2:
+	testq	%rcx,%rcx
+	jz	L$aad_done__func2
+	vmovdqu	256-16(%rsi),%xmm9
+L$aad_loop_blockbyblock__func2:
+	vmovdqu	(%rdx),%xmm0
+	vpshufb	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm9,%xmm5,%xmm0
+	vpclmulqdq	$0x01,%xmm9,%xmm5,%xmm1
+	vpclmulqdq	$0x10,%xmm9,%xmm5,%xmm2
+	vpxord	%xmm2,%xmm1,%xmm1
+	vpclmulqdq	$0x01,%xmm0,%xmm10,%xmm2
+	vpshufd	$0x4e,%xmm0,%xmm0
+	vpternlogd	$0x96,%xmm2,%xmm0,%xmm1
+	vpclmulqdq	$0x11,%xmm9,%xmm5,%xmm5
+	vpclmulqdq	$0x01,%xmm1,%xmm10,%xmm0
+	vpshufd	$0x4e,%xmm1,%xmm1
+	vpternlogd	$0x96,%xmm0,%xmm1,%xmm5
+
+	addq	$16,%rdx
+	subq	$16,%rcx
+	jnz	L$aad_loop_blockbyblock__func2
+
+L$aad_done__func2:
+
+	vpshufb	%xmm4,%xmm5,%xmm5
+	vmovdqu	%xmm5,(%rdi)
+	ret
+
+
+
+.globl	_aes_gcm_enc_update_vaes_avx10_512
+.private_extern _aes_gcm_enc_update_vaes_avx10_512
+
+.p2align	5
+_aes_gcm_enc_update_vaes_avx10_512:
+
+
+_CET_ENDBR
+	pushq	%r12
+
+
+	movq	16(%rsp),%r12
+#ifdef BORINGSSL_DISPATCH_TEST
+
+	movb	$1,_BORINGSSL_function_hit+7(%rip)
+#endif
+
+	vbroadcasti32x4	L$bswap_mask(%rip),%zmm8
+	vbroadcasti32x4	L$gfpoly(%rip),%zmm31
+
+
+
+	vmovdqu	(%r12),%xmm10
+	vpshufb	%xmm8,%xmm10,%xmm10
+	vbroadcasti32x4	(%r8),%zmm12
+	vpshufb	%zmm8,%zmm12,%zmm12
+
+
+
+	movl	240(%rcx),%r10d
+	leal	-20(,%r10,4),%r10d
+
+
+
+
+	leaq	96(%rcx,%r10,4),%r11
+	vbroadcasti32x4	(%rcx),%zmm13
+	vbroadcasti32x4	(%r11),%zmm14
+
+
+	vpaddd	L$ctr_pattern(%rip),%zmm12,%zmm12
+
+
+	vbroadcasti32x4	L$inc_4blocks(%rip),%zmm11
+
+
+
+	cmpq	$256-1,%rdx
+	jbe	L$crypt_loop_4x_done__func3
+
+
+	vmovdqu8	256-256(%r9),%zmm27
+	vmovdqu8	256-192(%r9),%zmm28
+	vmovdqu8	256-128(%r9),%zmm29
+	vmovdqu8	256-64(%r9),%zmm30
+
+
+
+
+	vpshufb	%zmm8,%zmm12,%zmm0
+	vpaddd	%zmm11,%zmm12,%zmm12
+	vpshufb	%zmm8,%zmm12,%zmm1
+	vpaddd	%zmm11,%zmm12,%zmm12
+	vpshufb	%zmm8,%zmm12,%zmm2
+	vpaddd	%zmm11,%zmm12,%zmm12
+	vpshufb	%zmm8,%zmm12,%zmm3
+	vpaddd	%zmm11,%zmm12,%zmm12
+
+
+	vpxord	%zmm13,%zmm0,%zmm0
+	vpxord	%zmm13,%zmm1,%zmm1
+	vpxord	%zmm13,%zmm2,%zmm2
+	vpxord	%zmm13,%zmm3,%zmm3
+
+	leaq	16(%rcx),%rax
+L$vaesenc_loop_first_4_vecs__func3:
+	vbroadcasti32x4	(%rax),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	vaesenc	%zmm9,%zmm1,%zmm1
+	vaesenc	%zmm9,%zmm2,%zmm2
+	vaesenc	%zmm9,%zmm3,%zmm3
+
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	L$vaesenc_loop_first_4_vecs__func3
+
+
+
+	vpxord	0(%rdi),%zmm14,%zmm4
+	vpxord	64(%rdi),%zmm14,%zmm5
+	vpxord	128(%rdi),%zmm14,%zmm6
+	vpxord	192(%rdi),%zmm14,%zmm7
+
+
+
+	vaesenclast	%zmm4,%zmm0,%zmm4
+	vaesenclast	%zmm5,%zmm1,%zmm5
+	vaesenclast	%zmm6,%zmm2,%zmm6
+	vaesenclast	%zmm7,%zmm3,%zmm7
+
+
+	vmovdqu8	%zmm4,0(%rsi)
+	vmovdqu8	%zmm5,64(%rsi)
+	vmovdqu8	%zmm6,128(%rsi)
+	vmovdqu8	%zmm7,192(%rsi)
+
+	subq	$-256,%rdi
+	subq	$-256,%rsi
+	addq	$-256,%rdx
+	cmpq	$256-1,%rdx
+	jbe	L$ghash_last_ciphertext_4x__func3
+	vbroadcasti32x4	-144(%r11),%zmm15
+	vbroadcasti32x4	-128(%r11),%zmm16
+	vbroadcasti32x4	-112(%r11),%zmm17
+	vbroadcasti32x4	-96(%r11),%zmm18
+	vbroadcasti32x4	-80(%r11),%zmm19
+	vbroadcasti32x4	-64(%r11),%zmm20
+	vbroadcasti32x4	-48(%r11),%zmm21
+	vbroadcasti32x4	-32(%r11),%zmm22
+	vbroadcasti32x4	-16(%r11),%zmm23
+L$crypt_loop_4x__func3:
+
+
+
+	vpshufb	%zmm8,%zmm12,%zmm0
+	vpaddd	%zmm11,%zmm12,%zmm12
+	vpshufb	%zmm8,%zmm12,%zmm1
+	vpaddd	%zmm11,%zmm12,%zmm12
+	vpshufb	%zmm8,%zmm12,%zmm2
+	vpaddd	%zmm11,%zmm12,%zmm12
+	vpshufb	%zmm8,%zmm12,%zmm3
+	vpaddd	%zmm11,%zmm12,%zmm12
+
+
+	vpxord	%zmm13,%zmm0,%zmm0
+	vpxord	%zmm13,%zmm1,%zmm1
+	vpxord	%zmm13,%zmm2,%zmm2
+	vpxord	%zmm13,%zmm3,%zmm3
+
+	cmpl	$24,%r10d
+	jl	L$aes128__func3
+	je	L$aes192__func3
+
+	vbroadcasti32x4	-208(%r11),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	vaesenc	%zmm9,%zmm1,%zmm1
+	vaesenc	%zmm9,%zmm2,%zmm2
+	vaesenc	%zmm9,%zmm3,%zmm3
+
+	vbroadcasti32x4	-192(%r11),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	vaesenc	%zmm9,%zmm1,%zmm1
+	vaesenc	%zmm9,%zmm2,%zmm2
+	vaesenc	%zmm9,%zmm3,%zmm3
+
+L$aes192__func3:
+	vbroadcasti32x4	-176(%r11),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	vaesenc	%zmm9,%zmm1,%zmm1
+	vaesenc	%zmm9,%zmm2,%zmm2
+	vaesenc	%zmm9,%zmm3,%zmm3
+
+	vbroadcasti32x4	-160(%r11),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	vaesenc	%zmm9,%zmm1,%zmm1
+	vaesenc	%zmm9,%zmm2,%zmm2
+	vaesenc	%zmm9,%zmm3,%zmm3
+
+L$aes128__func3:
+	vpshufb	%zmm8,%zmm4,%zmm4
+	vpxord	%zmm10,%zmm4,%zmm4
+	vpshufb	%zmm8,%zmm5,%zmm5
+	vpshufb	%zmm8,%zmm6,%zmm6
+
+	vaesenc	%zmm15,%zmm0,%zmm0
+	vaesenc	%zmm15,%zmm1,%zmm1
+	vaesenc	%zmm15,%zmm2,%zmm2
+	vaesenc	%zmm15,%zmm3,%zmm3
+
+	vpshufb	%zmm8,%zmm7,%zmm7
+	vpclmulqdq	$0x00,%zmm27,%zmm4,%zmm10
+	vpclmulqdq	$0x00,%zmm28,%zmm5,%zmm24
+	vpclmulqdq	$0x00,%zmm29,%zmm6,%zmm25
+
+	vaesenc	%zmm16,%zmm0,%zmm0
+	vaesenc	%zmm16,%zmm1,%zmm1
+	vaesenc	%zmm16,%zmm2,%zmm2
+	vaesenc	%zmm16,%zmm3,%zmm3
+
+	vpxord	%zmm24,%zmm10,%zmm10
+	vpclmulqdq	$0x00,%zmm30,%zmm7,%zmm26
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm10
+	vpclmulqdq	$0x01,%zmm27,%zmm4,%zmm24
+
+	vaesenc	%zmm17,%zmm0,%zmm0
+	vaesenc	%zmm17,%zmm1,%zmm1
+	vaesenc	%zmm17,%zmm2,%zmm2
+	vaesenc	%zmm17,%zmm3,%zmm3
+
+	vpclmulqdq	$0x01,%zmm28,%zmm5,%zmm25
+	vpclmulqdq	$0x01,%zmm29,%zmm6,%zmm26
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm24
+	vpclmulqdq	$0x01,%zmm30,%zmm7,%zmm25
+
+	vaesenc	%zmm18,%zmm0,%zmm0
+	vaesenc	%zmm18,%zmm1,%zmm1
+	vaesenc	%zmm18,%zmm2,%zmm2
+	vaesenc	%zmm18,%zmm3,%zmm3
+
+	vpclmulqdq	$0x10,%zmm27,%zmm4,%zmm26
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm24
+	vpclmulqdq	$0x10,%zmm28,%zmm5,%zmm25
+	vpclmulqdq	$0x10,%zmm29,%zmm6,%zmm26
+
+	vaesenc	%zmm19,%zmm0,%zmm0
+	vaesenc	%zmm19,%zmm1,%zmm1
+	vaesenc	%zmm19,%zmm2,%zmm2
+	vaesenc	%zmm19,%zmm3,%zmm3
+
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm24
+	vpclmulqdq	$0x01,%zmm10,%zmm31,%zmm26
+	vpclmulqdq	$0x10,%zmm30,%zmm7,%zmm25
+	vpxord	%zmm25,%zmm24,%zmm24
+
+	vaesenc	%zmm20,%zmm0,%zmm0
+	vaesenc	%zmm20,%zmm1,%zmm1
+	vaesenc	%zmm20,%zmm2,%zmm2
+	vaesenc	%zmm20,%zmm3,%zmm3
+
+	vpshufd	$0x4e,%zmm10,%zmm10
+	vpclmulqdq	$0x11,%zmm27,%zmm4,%zmm4
+	vpclmulqdq	$0x11,%zmm28,%zmm5,%zmm5
+	vpclmulqdq	$0x11,%zmm29,%zmm6,%zmm6
+
+	vaesenc	%zmm21,%zmm0,%zmm0
+	vaesenc	%zmm21,%zmm1,%zmm1
+	vaesenc	%zmm21,%zmm2,%zmm2
+	vaesenc	%zmm21,%zmm3,%zmm3
+
+	vpternlogd	$0x96,%zmm26,%zmm10,%zmm24
+	vpclmulqdq	$0x11,%zmm30,%zmm7,%zmm7
+	vpternlogd	$0x96,%zmm6,%zmm5,%zmm4
+	vpclmulqdq	$0x01,%zmm24,%zmm31,%zmm25
+
+	vaesenc	%zmm22,%zmm0,%zmm0
+	vaesenc	%zmm22,%zmm1,%zmm1
+	vaesenc	%zmm22,%zmm2,%zmm2
+	vaesenc	%zmm22,%zmm3,%zmm3
+
+	vpxord	%zmm7,%zmm4,%zmm10
+	vpshufd	$0x4e,%zmm24,%zmm24
+	vpternlogd	$0x96,%zmm25,%zmm24,%zmm10
+
+	vaesenc	%zmm23,%zmm0,%zmm0
+	vaesenc	%zmm23,%zmm1,%zmm1
+	vaesenc	%zmm23,%zmm2,%zmm2
+	vaesenc	%zmm23,%zmm3,%zmm3
+
+	vextracti32x4	$1,%zmm10,%xmm4
+	vextracti32x4	$2,%zmm10,%xmm5
+	vextracti32x4	$3,%zmm10,%xmm6
+	vpxord	%xmm4,%xmm10,%xmm10
+	vpternlogd	$0x96,%xmm5,%xmm6,%xmm10
+
+
+
+
+	vpxord	0(%rdi),%zmm14,%zmm4
+	vpxord	64(%rdi),%zmm14,%zmm5
+	vpxord	128(%rdi),%zmm14,%zmm6
+	vpxord	192(%rdi),%zmm14,%zmm7
+
+
+
+	vaesenclast	%zmm4,%zmm0,%zmm4
+	vaesenclast	%zmm5,%zmm1,%zmm5
+	vaesenclast	%zmm6,%zmm2,%zmm6
+	vaesenclast	%zmm7,%zmm3,%zmm7
+
+
+	vmovdqu8	%zmm4,0(%rsi)
+	vmovdqu8	%zmm5,64(%rsi)
+	vmovdqu8	%zmm6,128(%rsi)
+	vmovdqu8	%zmm7,192(%rsi)
+
+	subq	$-256,%rdi
+	subq	$-256,%rsi
+	addq	$-256,%rdx
+	cmpq	$256-1,%rdx
+	ja	L$crypt_loop_4x__func3
+L$ghash_last_ciphertext_4x__func3:
+	vpshufb	%zmm8,%zmm4,%zmm4
+	vpxord	%zmm10,%zmm4,%zmm4
+	vpshufb	%zmm8,%zmm5,%zmm5
+	vpshufb	%zmm8,%zmm6,%zmm6
+	vpshufb	%zmm8,%zmm7,%zmm7
+	vpclmulqdq	$0x00,%zmm27,%zmm4,%zmm10
+	vpclmulqdq	$0x00,%zmm28,%zmm5,%zmm24
+	vpclmulqdq	$0x00,%zmm29,%zmm6,%zmm25
+	vpxord	%zmm24,%zmm10,%zmm10
+	vpclmulqdq	$0x00,%zmm30,%zmm7,%zmm26
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm10
+	vpclmulqdq	$0x01,%zmm27,%zmm4,%zmm24
+	vpclmulqdq	$0x01,%zmm28,%zmm5,%zmm25
+	vpclmulqdq	$0x01,%zmm29,%zmm6,%zmm26
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm24
+	vpclmulqdq	$0x01,%zmm30,%zmm7,%zmm25
+	vpclmulqdq	$0x10,%zmm27,%zmm4,%zmm26
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm24
+	vpclmulqdq	$0x10,%zmm28,%zmm5,%zmm25
+	vpclmulqdq	$0x10,%zmm29,%zmm6,%zmm26
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm24
+	vpclmulqdq	$0x01,%zmm10,%zmm31,%zmm26
+	vpclmulqdq	$0x10,%zmm30,%zmm7,%zmm25
+	vpxord	%zmm25,%zmm24,%zmm24
+	vpshufd	$0x4e,%zmm10,%zmm10
+	vpclmulqdq	$0x11,%zmm27,%zmm4,%zmm4
+	vpclmulqdq	$0x11,%zmm28,%zmm5,%zmm5
+	vpclmulqdq	$0x11,%zmm29,%zmm6,%zmm6
+	vpternlogd	$0x96,%zmm26,%zmm10,%zmm24
+	vpclmulqdq	$0x11,%zmm30,%zmm7,%zmm7
+	vpternlogd	$0x96,%zmm6,%zmm5,%zmm4
+	vpclmulqdq	$0x01,%zmm24,%zmm31,%zmm25
+	vpxord	%zmm7,%zmm4,%zmm10
+	vpshufd	$0x4e,%zmm24,%zmm24
+	vpternlogd	$0x96,%zmm25,%zmm24,%zmm10
+	vextracti32x4	$1,%zmm10,%xmm4
+	vextracti32x4	$2,%zmm10,%xmm5
+	vextracti32x4	$3,%zmm10,%xmm6
+	vpxord	%xmm4,%xmm10,%xmm10
+	vpternlogd	$0x96,%xmm5,%xmm6,%xmm10
+
+L$crypt_loop_4x_done__func3:
+
+	testq	%rdx,%rdx
+	jz	L$done__func3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	movq	%rdx,%rax
+	negq	%rax
+	andq	$-16,%rax
+	leaq	256(%r9,%rax,1),%r8
+	vpxor	%xmm4,%xmm4,%xmm4
+	vpxor	%xmm5,%xmm5,%xmm5
+	vpxor	%xmm6,%xmm6,%xmm6
+
+	cmpq	$64,%rdx
+	jb	L$partial_vec__func3
+
+L$crypt_loop_1x__func3:
+
+
+
+	vpshufb	%zmm8,%zmm12,%zmm0
+	vpaddd	%zmm11,%zmm12,%zmm12
+	vpxord	%zmm13,%zmm0,%zmm0
+	leaq	16(%rcx),%rax
+L$vaesenc_loop_tail_full_vec__func3:
+	vbroadcasti32x4	(%rax),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	L$vaesenc_loop_tail_full_vec__func3
+	vaesenclast	%zmm14,%zmm0,%zmm0
+
+
+	vmovdqu8	(%rdi),%zmm1
+	vpxord	%zmm1,%zmm0,%zmm0
+	vmovdqu8	%zmm0,(%rsi)
+
+
+	vmovdqu8	(%r8),%zmm30
+	vpshufb	%zmm8,%zmm0,%zmm0
+	vpxord	%zmm10,%zmm0,%zmm0
+	vpclmulqdq	$0x00,%zmm30,%zmm0,%zmm7
+	vpclmulqdq	$0x01,%zmm30,%zmm0,%zmm1
+	vpclmulqdq	$0x10,%zmm30,%zmm0,%zmm2
+	vpclmulqdq	$0x11,%zmm30,%zmm0,%zmm3
+	vpxord	%zmm7,%zmm4,%zmm4
+	vpternlogd	$0x96,%zmm2,%zmm1,%zmm5
+	vpxord	%zmm3,%zmm6,%zmm6
+
+	vpxor	%xmm10,%xmm10,%xmm10
+
+	addq	$64,%r8
+	addq	$64,%rdi
+	addq	$64,%rsi
+	subq	$64,%rdx
+	cmpq	$64,%rdx
+	jae	L$crypt_loop_1x__func3
+
+	testq	%rdx,%rdx
+	jz	L$reduce__func3
+
+L$partial_vec__func3:
+
+
+
+
+	movq	$-1,%rax
+	bzhiq	%rdx,%rax,%rax
+	kmovq	%rax,%k1
+	addq	$15,%rdx
+	andq	$-16,%rdx
+	movq	$-1,%rax
+	bzhiq	%rdx,%rax,%rax
+	kmovq	%rax,%k2
+
+
+
+	vpshufb	%zmm8,%zmm12,%zmm0
+	vpxord	%zmm13,%zmm0,%zmm0
+	leaq	16(%rcx),%rax
+L$vaesenc_loop_tail_partialvec__func3:
+	vbroadcasti32x4	(%rax),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	L$vaesenc_loop_tail_partialvec__func3
+	vaesenclast	%zmm14,%zmm0,%zmm0
+
+
+	vmovdqu8	(%rdi),%zmm1{%k1}{z}
+	vpxord	%zmm1,%zmm0,%zmm0
+	vmovdqu8	%zmm0,(%rsi){%k1}
+
+
+
+
+
+
+
+
+
+
+
+
+
+	vmovdqu8	(%r8),%zmm30{%k2}{z}
+	vmovdqu8	%zmm0,%zmm1{%k1}{z}
+	vpshufb	%zmm8,%zmm1,%zmm0
+	vpxord	%zmm10,%zmm0,%zmm0
+	vpclmulqdq	$0x00,%zmm30,%zmm0,%zmm7
+	vpclmulqdq	$0x01,%zmm30,%zmm0,%zmm1
+	vpclmulqdq	$0x10,%zmm30,%zmm0,%zmm2
+	vpclmulqdq	$0x11,%zmm30,%zmm0,%zmm3
+	vpxord	%zmm7,%zmm4,%zmm4
+	vpternlogd	$0x96,%zmm2,%zmm1,%zmm5
+	vpxord	%zmm3,%zmm6,%zmm6
+
+
+L$reduce__func3:
+
+	vpclmulqdq	$0x01,%zmm4,%zmm31,%zmm0
+	vpshufd	$0x4e,%zmm4,%zmm4
+	vpternlogd	$0x96,%zmm0,%zmm4,%zmm5
+	vpclmulqdq	$0x01,%zmm5,%zmm31,%zmm0
+	vpshufd	$0x4e,%zmm5,%zmm5
+	vpternlogd	$0x96,%zmm0,%zmm5,%zmm6
+
+	vextracti32x4	$1,%zmm6,%xmm0
+	vextracti32x4	$2,%zmm6,%xmm1
+	vextracti32x4	$3,%zmm6,%xmm2
+	vpxord	%xmm0,%xmm6,%xmm10
+	vpternlogd	$0x96,%xmm1,%xmm2,%xmm10
+
+
+L$done__func3:
+
+	vpshufb	%xmm8,%xmm10,%xmm10
+	vmovdqu	%xmm10,(%r12)
+
+	vzeroupper
+	popq	%r12
+
+	ret
+
+
+
+.globl	_aes_gcm_dec_update_vaes_avx10_512
+.private_extern _aes_gcm_dec_update_vaes_avx10_512
+
+.p2align	5
+_aes_gcm_dec_update_vaes_avx10_512:
+
+
+_CET_ENDBR
+	pushq	%r12
+
+
+	movq	16(%rsp),%r12
+
+	vbroadcasti32x4	L$bswap_mask(%rip),%zmm8
+	vbroadcasti32x4	L$gfpoly(%rip),%zmm31
+
+
+
+	vmovdqu	(%r12),%xmm10
+	vpshufb	%xmm8,%xmm10,%xmm10
+	vbroadcasti32x4	(%r8),%zmm12
+	vpshufb	%zmm8,%zmm12,%zmm12
+
+
+
+	movl	240(%rcx),%r10d
+	leal	-20(,%r10,4),%r10d
+
+
+
+
+	leaq	96(%rcx,%r10,4),%r11
+	vbroadcasti32x4	(%rcx),%zmm13
+	vbroadcasti32x4	(%r11),%zmm14
+
+
+	vpaddd	L$ctr_pattern(%rip),%zmm12,%zmm12
+
+
+	vbroadcasti32x4	L$inc_4blocks(%rip),%zmm11
+
+
+
+	cmpq	$256-1,%rdx
+	jbe	L$crypt_loop_4x_done__func4
+
+
+	vmovdqu8	256-256(%r9),%zmm27
+	vmovdqu8	256-192(%r9),%zmm28
+	vmovdqu8	256-128(%r9),%zmm29
+	vmovdqu8	256-64(%r9),%zmm30
+	vbroadcasti32x4	-144(%r11),%zmm15
+	vbroadcasti32x4	-128(%r11),%zmm16
+	vbroadcasti32x4	-112(%r11),%zmm17
+	vbroadcasti32x4	-96(%r11),%zmm18
+	vbroadcasti32x4	-80(%r11),%zmm19
+	vbroadcasti32x4	-64(%r11),%zmm20
+	vbroadcasti32x4	-48(%r11),%zmm21
+	vbroadcasti32x4	-32(%r11),%zmm22
+	vbroadcasti32x4	-16(%r11),%zmm23
+L$crypt_loop_4x__func4:
+	vmovdqu8	0(%rdi),%zmm4
+	vmovdqu8	64(%rdi),%zmm5
+	vmovdqu8	128(%rdi),%zmm6
+	vmovdqu8	192(%rdi),%zmm7
+
+
+
+	vpshufb	%zmm8,%zmm12,%zmm0
+	vpaddd	%zmm11,%zmm12,%zmm12
+	vpshufb	%zmm8,%zmm12,%zmm1
+	vpaddd	%zmm11,%zmm12,%zmm12
+	vpshufb	%zmm8,%zmm12,%zmm2
+	vpaddd	%zmm11,%zmm12,%zmm12
+	vpshufb	%zmm8,%zmm12,%zmm3
+	vpaddd	%zmm11,%zmm12,%zmm12
+
+
+	vpxord	%zmm13,%zmm0,%zmm0
+	vpxord	%zmm13,%zmm1,%zmm1
+	vpxord	%zmm13,%zmm2,%zmm2
+	vpxord	%zmm13,%zmm3,%zmm3
+
+	cmpl	$24,%r10d
+	jl	L$aes128__func4
+	je	L$aes192__func4
+
+	vbroadcasti32x4	-208(%r11),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	vaesenc	%zmm9,%zmm1,%zmm1
+	vaesenc	%zmm9,%zmm2,%zmm2
+	vaesenc	%zmm9,%zmm3,%zmm3
+
+	vbroadcasti32x4	-192(%r11),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	vaesenc	%zmm9,%zmm1,%zmm1
+	vaesenc	%zmm9,%zmm2,%zmm2
+	vaesenc	%zmm9,%zmm3,%zmm3
+
+L$aes192__func4:
+	vbroadcasti32x4	-176(%r11),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	vaesenc	%zmm9,%zmm1,%zmm1
+	vaesenc	%zmm9,%zmm2,%zmm2
+	vaesenc	%zmm9,%zmm3,%zmm3
+
+	vbroadcasti32x4	-160(%r11),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	vaesenc	%zmm9,%zmm1,%zmm1
+	vaesenc	%zmm9,%zmm2,%zmm2
+	vaesenc	%zmm9,%zmm3,%zmm3
+
+L$aes128__func4:
+	vpshufb	%zmm8,%zmm4,%zmm4
+	vpxord	%zmm10,%zmm4,%zmm4
+	vpshufb	%zmm8,%zmm5,%zmm5
+	vpshufb	%zmm8,%zmm6,%zmm6
+
+	vaesenc	%zmm15,%zmm0,%zmm0
+	vaesenc	%zmm15,%zmm1,%zmm1
+	vaesenc	%zmm15,%zmm2,%zmm2
+	vaesenc	%zmm15,%zmm3,%zmm3
+
+	vpshufb	%zmm8,%zmm7,%zmm7
+	vpclmulqdq	$0x00,%zmm27,%zmm4,%zmm10
+	vpclmulqdq	$0x00,%zmm28,%zmm5,%zmm24
+	vpclmulqdq	$0x00,%zmm29,%zmm6,%zmm25
+
+	vaesenc	%zmm16,%zmm0,%zmm0
+	vaesenc	%zmm16,%zmm1,%zmm1
+	vaesenc	%zmm16,%zmm2,%zmm2
+	vaesenc	%zmm16,%zmm3,%zmm3
+
+	vpxord	%zmm24,%zmm10,%zmm10
+	vpclmulqdq	$0x00,%zmm30,%zmm7,%zmm26
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm10
+	vpclmulqdq	$0x01,%zmm27,%zmm4,%zmm24
+
+	vaesenc	%zmm17,%zmm0,%zmm0
+	vaesenc	%zmm17,%zmm1,%zmm1
+	vaesenc	%zmm17,%zmm2,%zmm2
+	vaesenc	%zmm17,%zmm3,%zmm3
+
+	vpclmulqdq	$0x01,%zmm28,%zmm5,%zmm25
+	vpclmulqdq	$0x01,%zmm29,%zmm6,%zmm26
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm24
+	vpclmulqdq	$0x01,%zmm30,%zmm7,%zmm25
+
+	vaesenc	%zmm18,%zmm0,%zmm0
+	vaesenc	%zmm18,%zmm1,%zmm1
+	vaesenc	%zmm18,%zmm2,%zmm2
+	vaesenc	%zmm18,%zmm3,%zmm3
+
+	vpclmulqdq	$0x10,%zmm27,%zmm4,%zmm26
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm24
+	vpclmulqdq	$0x10,%zmm28,%zmm5,%zmm25
+	vpclmulqdq	$0x10,%zmm29,%zmm6,%zmm26
+
+	vaesenc	%zmm19,%zmm0,%zmm0
+	vaesenc	%zmm19,%zmm1,%zmm1
+	vaesenc	%zmm19,%zmm2,%zmm2
+	vaesenc	%zmm19,%zmm3,%zmm3
+
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm24
+	vpclmulqdq	$0x01,%zmm10,%zmm31,%zmm26
+	vpclmulqdq	$0x10,%zmm30,%zmm7,%zmm25
+	vpxord	%zmm25,%zmm24,%zmm24
+
+	vaesenc	%zmm20,%zmm0,%zmm0
+	vaesenc	%zmm20,%zmm1,%zmm1
+	vaesenc	%zmm20,%zmm2,%zmm2
+	vaesenc	%zmm20,%zmm3,%zmm3
+
+	vpshufd	$0x4e,%zmm10,%zmm10
+	vpclmulqdq	$0x11,%zmm27,%zmm4,%zmm4
+	vpclmulqdq	$0x11,%zmm28,%zmm5,%zmm5
+	vpclmulqdq	$0x11,%zmm29,%zmm6,%zmm6
+
+	vaesenc	%zmm21,%zmm0,%zmm0
+	vaesenc	%zmm21,%zmm1,%zmm1
+	vaesenc	%zmm21,%zmm2,%zmm2
+	vaesenc	%zmm21,%zmm3,%zmm3
+
+	vpternlogd	$0x96,%zmm26,%zmm10,%zmm24
+	vpclmulqdq	$0x11,%zmm30,%zmm7,%zmm7
+	vpternlogd	$0x96,%zmm6,%zmm5,%zmm4
+	vpclmulqdq	$0x01,%zmm24,%zmm31,%zmm25
+
+	vaesenc	%zmm22,%zmm0,%zmm0
+	vaesenc	%zmm22,%zmm1,%zmm1
+	vaesenc	%zmm22,%zmm2,%zmm2
+	vaesenc	%zmm22,%zmm3,%zmm3
+
+	vpxord	%zmm7,%zmm4,%zmm10
+	vpshufd	$0x4e,%zmm24,%zmm24
+	vpternlogd	$0x96,%zmm25,%zmm24,%zmm10
+
+	vaesenc	%zmm23,%zmm0,%zmm0
+	vaesenc	%zmm23,%zmm1,%zmm1
+	vaesenc	%zmm23,%zmm2,%zmm2
+	vaesenc	%zmm23,%zmm3,%zmm3
+
+	vextracti32x4	$1,%zmm10,%xmm4
+	vextracti32x4	$2,%zmm10,%xmm5
+	vextracti32x4	$3,%zmm10,%xmm6
+	vpxord	%xmm4,%xmm10,%xmm10
+	vpternlogd	$0x96,%xmm5,%xmm6,%xmm10
+
+
+
+
+	vpxord	0(%rdi),%zmm14,%zmm4
+	vpxord	64(%rdi),%zmm14,%zmm5
+	vpxord	128(%rdi),%zmm14,%zmm6
+	vpxord	192(%rdi),%zmm14,%zmm7
+
+
+
+	vaesenclast	%zmm4,%zmm0,%zmm4
+	vaesenclast	%zmm5,%zmm1,%zmm5
+	vaesenclast	%zmm6,%zmm2,%zmm6
+	vaesenclast	%zmm7,%zmm3,%zmm7
+
+
+	vmovdqu8	%zmm4,0(%rsi)
+	vmovdqu8	%zmm5,64(%rsi)
+	vmovdqu8	%zmm6,128(%rsi)
+	vmovdqu8	%zmm7,192(%rsi)
+
+	subq	$-256,%rdi
+	subq	$-256,%rsi
+	addq	$-256,%rdx
+	cmpq	$256-1,%rdx
+	ja	L$crypt_loop_4x__func4
+L$crypt_loop_4x_done__func4:
+
+	testq	%rdx,%rdx
+	jz	L$done__func4
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	movq	%rdx,%rax
+	negq	%rax
+	andq	$-16,%rax
+	leaq	256(%r9,%rax,1),%r8
+	vpxor	%xmm4,%xmm4,%xmm4
+	vpxor	%xmm5,%xmm5,%xmm5
+	vpxor	%xmm6,%xmm6,%xmm6
+
+	cmpq	$64,%rdx
+	jb	L$partial_vec__func4
+
+L$crypt_loop_1x__func4:
+
+
+
+	vpshufb	%zmm8,%zmm12,%zmm0
+	vpaddd	%zmm11,%zmm12,%zmm12
+	vpxord	%zmm13,%zmm0,%zmm0
+	leaq	16(%rcx),%rax
+L$vaesenc_loop_tail_full_vec__func4:
+	vbroadcasti32x4	(%rax),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	L$vaesenc_loop_tail_full_vec__func4
+	vaesenclast	%zmm14,%zmm0,%zmm0
+
+
+	vmovdqu8	(%rdi),%zmm1
+	vpxord	%zmm1,%zmm0,%zmm0
+	vmovdqu8	%zmm0,(%rsi)
+
+
+	vmovdqu8	(%r8),%zmm30
+	vpshufb	%zmm8,%zmm1,%zmm0
+	vpxord	%zmm10,%zmm0,%zmm0
+	vpclmulqdq	$0x00,%zmm30,%zmm0,%zmm7
+	vpclmulqdq	$0x01,%zmm30,%zmm0,%zmm1
+	vpclmulqdq	$0x10,%zmm30,%zmm0,%zmm2
+	vpclmulqdq	$0x11,%zmm30,%zmm0,%zmm3
+	vpxord	%zmm7,%zmm4,%zmm4
+	vpternlogd	$0x96,%zmm2,%zmm1,%zmm5
+	vpxord	%zmm3,%zmm6,%zmm6
+
+	vpxor	%xmm10,%xmm10,%xmm10
+
+	addq	$64,%r8
+	addq	$64,%rdi
+	addq	$64,%rsi
+	subq	$64,%rdx
+	cmpq	$64,%rdx
+	jae	L$crypt_loop_1x__func4
+
+	testq	%rdx,%rdx
+	jz	L$reduce__func4
+
+L$partial_vec__func4:
+
+
+
+
+	movq	$-1,%rax
+	bzhiq	%rdx,%rax,%rax
+	kmovq	%rax,%k1
+	addq	$15,%rdx
+	andq	$-16,%rdx
+	movq	$-1,%rax
+	bzhiq	%rdx,%rax,%rax
+	kmovq	%rax,%k2
+
+
+
+	vpshufb	%zmm8,%zmm12,%zmm0
+	vpxord	%zmm13,%zmm0,%zmm0
+	leaq	16(%rcx),%rax
+L$vaesenc_loop_tail_partialvec__func4:
+	vbroadcasti32x4	(%rax),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	L$vaesenc_loop_tail_partialvec__func4
+	vaesenclast	%zmm14,%zmm0,%zmm0
+
+
+	vmovdqu8	(%rdi),%zmm1{%k1}{z}
+	vpxord	%zmm1,%zmm0,%zmm0
+	vmovdqu8	%zmm0,(%rsi){%k1}
+
+
+
+
+
+
+
+
+
+
+
+
+
+	vmovdqu8	(%r8),%zmm30{%k2}{z}
+
+	vpshufb	%zmm8,%zmm1,%zmm0
+	vpxord	%zmm10,%zmm0,%zmm0
+	vpclmulqdq	$0x00,%zmm30,%zmm0,%zmm7
+	vpclmulqdq	$0x01,%zmm30,%zmm0,%zmm1
+	vpclmulqdq	$0x10,%zmm30,%zmm0,%zmm2
+	vpclmulqdq	$0x11,%zmm30,%zmm0,%zmm3
+	vpxord	%zmm7,%zmm4,%zmm4
+	vpternlogd	$0x96,%zmm2,%zmm1,%zmm5
+	vpxord	%zmm3,%zmm6,%zmm6
+
+
+L$reduce__func4:
+
+	vpclmulqdq	$0x01,%zmm4,%zmm31,%zmm0
+	vpshufd	$0x4e,%zmm4,%zmm4
+	vpternlogd	$0x96,%zmm0,%zmm4,%zmm5
+	vpclmulqdq	$0x01,%zmm5,%zmm31,%zmm0
+	vpshufd	$0x4e,%zmm5,%zmm5
+	vpternlogd	$0x96,%zmm0,%zmm5,%zmm6
+
+	vextracti32x4	$1,%zmm6,%xmm0
+	vextracti32x4	$2,%zmm6,%xmm1
+	vextracti32x4	$3,%zmm6,%xmm2
+	vpxord	%xmm0,%xmm6,%xmm10
+	vpternlogd	$0x96,%xmm1,%xmm2,%xmm10
+
+
+L$done__func4:
+
+	vpshufb	%xmm8,%xmm10,%xmm10
+	vmovdqu	%xmm10,(%r12)
+
+	vzeroupper
+	popq	%r12
+
+	ret
+
+
+
+#endif
diff --git a/gen/bcm/aes-gcm-avx10-x86_64-linux.S b/gen/bcm/aes-gcm-avx10-x86_64-linux.S
new file mode 100644
index 0000000..cf661c8
--- /dev/null
+++ b/gen/bcm/aes-gcm-avx10-x86_64-linux.S
@@ -0,0 +1,2274 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.section	.rodata
+.align	64
+
+
+.Lbswap_mask:
+.quad	0x08090a0b0c0d0e0f, 0x0001020304050607
+
+
+
+
+
+
+
+
+.Lgfpoly:
+.quad	1, 0xc200000000000000
+
+
+.Lgfpoly_and_internal_carrybit:
+.quad	1, 0xc200000000000001
+
+
+
+
+
+.Lctr_pattern:
+.quad	0, 0
+.quad	1, 0
+.Linc_2blocks:
+.quad	2, 0
+.quad	3, 0
+.Linc_4blocks:
+.quad	4, 0
+
+.text	
+.globl	gcm_gmult_vpclmulqdq_avx10
+.hidden gcm_gmult_vpclmulqdq_avx10
+.type	gcm_gmult_vpclmulqdq_avx10,@function
+.align	32
+gcm_gmult_vpclmulqdq_avx10:
+.cfi_startproc	
+
+_CET_ENDBR
+
+
+
+	vmovdqu	(%rdi),%xmm0
+	vmovdqu	.Lbswap_mask(%rip),%xmm1
+	vmovdqu	256-16(%rsi),%xmm2
+	vmovdqu	.Lgfpoly(%rip),%xmm3
+	vpshufb	%xmm1,%xmm0,%xmm0
+
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm4
+	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm5
+	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm6
+	vpxord	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x01,%xmm4,%xmm3,%xmm6
+	vpshufd	$0x4e,%xmm4,%xmm4
+	vpternlogd	$0x96,%xmm6,%xmm4,%xmm5
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm0
+	vpclmulqdq	$0x01,%xmm5,%xmm3,%xmm4
+	vpshufd	$0x4e,%xmm5,%xmm5
+	vpternlogd	$0x96,%xmm4,%xmm5,%xmm0
+
+
+	vpshufb	%xmm1,%xmm0,%xmm0
+	vmovdqu	%xmm0,(%rdi)
+	ret
+
+.cfi_endproc	
+.size	gcm_gmult_vpclmulqdq_avx10, . - gcm_gmult_vpclmulqdq_avx10
+.globl	gcm_init_vpclmulqdq_avx10
+.hidden gcm_init_vpclmulqdq_avx10
+.type	gcm_init_vpclmulqdq_avx10,@function
+.align	32
+gcm_init_vpclmulqdq_avx10:
+.cfi_startproc	
+
+_CET_ENDBR
+
+	leaq	256-32(%rdi),%r8
+
+
+
+	vpshufd	$0x4e,(%rsi),%xmm3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	vpshufd	$0xd3,%xmm3,%xmm0
+	vpsrad	$31,%xmm0,%xmm0
+	vpaddq	%xmm3,%xmm3,%xmm3
+
+	vpternlogd	$0x78,.Lgfpoly_and_internal_carrybit(%rip),%xmm0,%xmm3
+
+
+	vbroadcasti32x4	.Lgfpoly(%rip),%ymm5
+
+
+
+
+
+
+
+
+	vpclmulqdq	$0x00,%xmm3,%xmm3,%xmm0
+	vpclmulqdq	$0x01,%xmm3,%xmm3,%xmm1
+	vpclmulqdq	$0x10,%xmm3,%xmm3,%xmm2
+	vpxord	%xmm2,%xmm1,%xmm1
+	vpclmulqdq	$0x01,%xmm0,%xmm5,%xmm2
+	vpshufd	$0x4e,%xmm0,%xmm0
+	vpternlogd	$0x96,%xmm2,%xmm0,%xmm1
+	vpclmulqdq	$0x11,%xmm3,%xmm3,%xmm4
+	vpclmulqdq	$0x01,%xmm1,%xmm5,%xmm0
+	vpshufd	$0x4e,%xmm1,%xmm1
+	vpternlogd	$0x96,%xmm0,%xmm1,%xmm4
+
+
+
+	vinserti128	$1,%xmm3,%ymm4,%ymm3
+	vinserti128	$1,%xmm4,%ymm4,%ymm4
+
+	vmovdqu8	%ymm3,(%r8)
+
+
+
+
+
+	movl	$7,%eax
+.Lprecompute_next__func1:
+	subq	$32,%r8
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm0
+	vpclmulqdq	$0x01,%ymm4,%ymm3,%ymm1
+	vpclmulqdq	$0x10,%ymm4,%ymm3,%ymm2
+	vpxord	%ymm2,%ymm1,%ymm1
+	vpclmulqdq	$0x01,%ymm0,%ymm5,%ymm2
+	vpshufd	$0x4e,%ymm0,%ymm0
+	vpternlogd	$0x96,%ymm2,%ymm0,%ymm1
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm3
+	vpclmulqdq	$0x01,%ymm1,%ymm5,%ymm0
+	vpshufd	$0x4e,%ymm1,%ymm1
+	vpternlogd	$0x96,%ymm0,%ymm1,%ymm3
+
+	vmovdqu8	%ymm3,(%r8)
+	decl	%eax
+	jnz	.Lprecompute_next__func1
+
+	vzeroupper
+	ret
+
+.cfi_endproc	
+.size	gcm_init_vpclmulqdq_avx10, . - gcm_init_vpclmulqdq_avx10
+.globl	gcm_ghash_vpclmulqdq_avx10_256
+.hidden gcm_ghash_vpclmulqdq_avx10_256
+.type	gcm_ghash_vpclmulqdq_avx10_256,@function
+.align	32
+gcm_ghash_vpclmulqdq_avx10_256:
+.cfi_startproc	
+
+_CET_ENDBR
+
+
+
+
+
+
+	vmovdqu	.Lbswap_mask(%rip),%xmm4
+	vmovdqu	.Lgfpoly(%rip),%xmm10
+
+
+	vmovdqu	(%rdi),%xmm5
+	vpshufb	%xmm4,%xmm5,%xmm5
+
+
+	cmpq	$32,%rcx
+	jb	.Laad_blockbyblock__func1
+
+
+
+	vshufi64x2	$0,%ymm4,%ymm4,%ymm4
+	vshufi64x2	$0,%ymm10,%ymm10,%ymm10
+
+
+	vmovdqu8	256-32(%rsi),%ymm9
+
+	cmpq	$128-1,%rcx
+	jbe	.Laad_loop_1x__func1
+
+
+	vmovdqu8	256-128(%rsi),%ymm6
+	vmovdqu8	256-96(%rsi),%ymm7
+	vmovdqu8	256-64(%rsi),%ymm8
+
+
+.Laad_loop_4x__func1:
+	vmovdqu8	0(%rdx),%ymm0
+	vmovdqu8	32(%rdx),%ymm1
+	vmovdqu8	64(%rdx),%ymm2
+	vmovdqu8	96(%rdx),%ymm3
+	vpshufb	%ymm4,%ymm0,%ymm0
+	vpxord	%ymm5,%ymm0,%ymm0
+	vpshufb	%ymm4,%ymm1,%ymm1
+	vpshufb	%ymm4,%ymm2,%ymm2
+	vpshufb	%ymm4,%ymm3,%ymm3
+	vpclmulqdq	$0x00,%ymm6,%ymm0,%ymm5
+	vpclmulqdq	$0x00,%ymm7,%ymm1,%ymm11
+	vpclmulqdq	$0x00,%ymm8,%ymm2,%ymm12
+	vpxord	%ymm11,%ymm5,%ymm5
+	vpclmulqdq	$0x00,%ymm9,%ymm3,%ymm13
+	vpternlogd	$0x96,%ymm13,%ymm12,%ymm5
+	vpclmulqdq	$0x01,%ymm6,%ymm0,%ymm11
+	vpclmulqdq	$0x01,%ymm7,%ymm1,%ymm12
+	vpclmulqdq	$0x01,%ymm8,%ymm2,%ymm13
+	vpternlogd	$0x96,%ymm13,%ymm12,%ymm11
+	vpclmulqdq	$0x01,%ymm9,%ymm3,%ymm12
+	vpclmulqdq	$0x10,%ymm6,%ymm0,%ymm13
+	vpternlogd	$0x96,%ymm13,%ymm12,%ymm11
+	vpclmulqdq	$0x10,%ymm7,%ymm1,%ymm12
+	vpclmulqdq	$0x10,%ymm8,%ymm2,%ymm13
+	vpternlogd	$0x96,%ymm13,%ymm12,%ymm11
+	vpclmulqdq	$0x01,%ymm5,%ymm10,%ymm13
+	vpclmulqdq	$0x10,%ymm9,%ymm3,%ymm12
+	vpxord	%ymm12,%ymm11,%ymm11
+	vpshufd	$0x4e,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm6,%ymm0,%ymm0
+	vpclmulqdq	$0x11,%ymm7,%ymm1,%ymm1
+	vpclmulqdq	$0x11,%ymm8,%ymm2,%ymm2
+	vpternlogd	$0x96,%ymm13,%ymm5,%ymm11
+	vpclmulqdq	$0x11,%ymm9,%ymm3,%ymm3
+	vpternlogd	$0x96,%ymm2,%ymm1,%ymm0
+	vpclmulqdq	$0x01,%ymm11,%ymm10,%ymm12
+	vpxord	%ymm3,%ymm0,%ymm5
+	vpshufd	$0x4e,%ymm11,%ymm11
+	vpternlogd	$0x96,%ymm12,%ymm11,%ymm5
+	vextracti32x4	$1,%ymm5,%xmm0
+	vpxord	%xmm0,%xmm5,%xmm5
+
+	subq	$-128,%rdx
+	addq	$-128,%rcx
+	cmpq	$128-1,%rcx
+	ja	.Laad_loop_4x__func1
+
+
+	cmpq	$32,%rcx
+	jb	.Laad_large_done__func1
+.Laad_loop_1x__func1:
+	vmovdqu8	(%rdx),%ymm0
+	vpshufb	%ymm4,%ymm0,%ymm0
+	vpxord	%ymm0,%ymm5,%ymm5
+	vpclmulqdq	$0x00,%ymm9,%ymm5,%ymm0
+	vpclmulqdq	$0x01,%ymm9,%ymm5,%ymm1
+	vpclmulqdq	$0x10,%ymm9,%ymm5,%ymm2
+	vpxord	%ymm2,%ymm1,%ymm1
+	vpclmulqdq	$0x01,%ymm0,%ymm10,%ymm2
+	vpshufd	$0x4e,%ymm0,%ymm0
+	vpternlogd	$0x96,%ymm2,%ymm0,%ymm1
+	vpclmulqdq	$0x11,%ymm9,%ymm5,%ymm5
+	vpclmulqdq	$0x01,%ymm1,%ymm10,%ymm0
+	vpshufd	$0x4e,%ymm1,%ymm1
+	vpternlogd	$0x96,%ymm0,%ymm1,%ymm5
+
+	vextracti32x4	$1,%ymm5,%xmm0
+	vpxord	%xmm0,%xmm5,%xmm5
+
+	addq	$32,%rdx
+	subq	$32,%rcx
+	cmpq	$32,%rcx
+	jae	.Laad_loop_1x__func1
+
+.Laad_large_done__func1:
+
+
+	vzeroupper
+
+
+.Laad_blockbyblock__func1:
+	testq	%rcx,%rcx
+	jz	.Laad_done__func1
+	vmovdqu	256-16(%rsi),%xmm9
+.Laad_loop_blockbyblock__func1:
+	vmovdqu	(%rdx),%xmm0
+	vpshufb	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm9,%xmm5,%xmm0
+	vpclmulqdq	$0x01,%xmm9,%xmm5,%xmm1
+	vpclmulqdq	$0x10,%xmm9,%xmm5,%xmm2
+	vpxord	%xmm2,%xmm1,%xmm1
+	vpclmulqdq	$0x01,%xmm0,%xmm10,%xmm2
+	vpshufd	$0x4e,%xmm0,%xmm0
+	vpternlogd	$0x96,%xmm2,%xmm0,%xmm1
+	vpclmulqdq	$0x11,%xmm9,%xmm5,%xmm5
+	vpclmulqdq	$0x01,%xmm1,%xmm10,%xmm0
+	vpshufd	$0x4e,%xmm1,%xmm1
+	vpternlogd	$0x96,%xmm0,%xmm1,%xmm5
+
+	addq	$16,%rdx
+	subq	$16,%rcx
+	jnz	.Laad_loop_blockbyblock__func1
+
+.Laad_done__func1:
+
+	vpshufb	%xmm4,%xmm5,%xmm5
+	vmovdqu	%xmm5,(%rdi)
+	ret
+
+.cfi_endproc	
+.size	gcm_ghash_vpclmulqdq_avx10_256, . - gcm_ghash_vpclmulqdq_avx10_256
+.globl	aes_gcm_enc_update_vaes_avx10_256
+.hidden aes_gcm_enc_update_vaes_avx10_256
+.type	aes_gcm_enc_update_vaes_avx10_256,@function
+.align	32
+aes_gcm_enc_update_vaes_avx10_256:
+.cfi_startproc	
+
+_CET_ENDBR
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-16
+
+	movq	16(%rsp),%r12
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern	BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+	movb	$1,BORINGSSL_function_hit+6(%rip)
+#endif
+
+	vbroadcasti32x4	.Lbswap_mask(%rip),%ymm8
+	vbroadcasti32x4	.Lgfpoly(%rip),%ymm31
+
+
+
+	vmovdqu	(%r12),%xmm10
+	vpshufb	%xmm8,%xmm10,%xmm10
+	vbroadcasti32x4	(%r8),%ymm12
+	vpshufb	%ymm8,%ymm12,%ymm12
+
+
+
+	movl	240(%rcx),%r10d
+	leal	-20(,%r10,4),%r10d
+
+
+
+
+	leaq	96(%rcx,%r10,4),%r11
+	vbroadcasti32x4	(%rcx),%ymm13
+	vbroadcasti32x4	(%r11),%ymm14
+
+
+	vpaddd	.Lctr_pattern(%rip),%ymm12,%ymm12
+
+
+	vbroadcasti32x4	.Linc_2blocks(%rip),%ymm11
+
+
+
+	cmpq	$128-1,%rdx
+	jbe	.Lcrypt_loop_4x_done__func1
+
+
+	vmovdqu8	256-128(%r9),%ymm27
+	vmovdqu8	256-96(%r9),%ymm28
+	vmovdqu8	256-64(%r9),%ymm29
+	vmovdqu8	256-32(%r9),%ymm30
+
+
+
+
+	vpshufb	%ymm8,%ymm12,%ymm0
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm12,%ymm1
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm12,%ymm2
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm12,%ymm3
+	vpaddd	%ymm11,%ymm12,%ymm12
+
+
+	vpxord	%ymm13,%ymm0,%ymm0
+	vpxord	%ymm13,%ymm1,%ymm1
+	vpxord	%ymm13,%ymm2,%ymm2
+	vpxord	%ymm13,%ymm3,%ymm3
+
+	leaq	16(%rcx),%rax
+.Lvaesenc_loop_first_4_vecs__func1:
+	vbroadcasti32x4	(%rax),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	vaesenc	%ymm9,%ymm1,%ymm1
+	vaesenc	%ymm9,%ymm2,%ymm2
+	vaesenc	%ymm9,%ymm3,%ymm3
+
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	.Lvaesenc_loop_first_4_vecs__func1
+
+
+
+	vpxord	0(%rdi),%ymm14,%ymm4
+	vpxord	32(%rdi),%ymm14,%ymm5
+	vpxord	64(%rdi),%ymm14,%ymm6
+	vpxord	96(%rdi),%ymm14,%ymm7
+
+
+
+	vaesenclast	%ymm4,%ymm0,%ymm4
+	vaesenclast	%ymm5,%ymm1,%ymm5
+	vaesenclast	%ymm6,%ymm2,%ymm6
+	vaesenclast	%ymm7,%ymm3,%ymm7
+
+
+	vmovdqu8	%ymm4,0(%rsi)
+	vmovdqu8	%ymm5,32(%rsi)
+	vmovdqu8	%ymm6,64(%rsi)
+	vmovdqu8	%ymm7,96(%rsi)
+
+	subq	$-128,%rdi
+	subq	$-128,%rsi
+	addq	$-128,%rdx
+	cmpq	$128-1,%rdx
+	jbe	.Lghash_last_ciphertext_4x__func1
+	vbroadcasti32x4	-144(%r11),%ymm15
+	vbroadcasti32x4	-128(%r11),%ymm16
+	vbroadcasti32x4	-112(%r11),%ymm17
+	vbroadcasti32x4	-96(%r11),%ymm18
+	vbroadcasti32x4	-80(%r11),%ymm19
+	vbroadcasti32x4	-64(%r11),%ymm20
+	vbroadcasti32x4	-48(%r11),%ymm21
+	vbroadcasti32x4	-32(%r11),%ymm22
+	vbroadcasti32x4	-16(%r11),%ymm23
+.Lcrypt_loop_4x__func1:
+
+
+
+	vpshufb	%ymm8,%ymm12,%ymm0
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm12,%ymm1
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm12,%ymm2
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm12,%ymm3
+	vpaddd	%ymm11,%ymm12,%ymm12
+
+
+	vpxord	%ymm13,%ymm0,%ymm0
+	vpxord	%ymm13,%ymm1,%ymm1
+	vpxord	%ymm13,%ymm2,%ymm2
+	vpxord	%ymm13,%ymm3,%ymm3
+
+	cmpl	$24,%r10d
+	jl	.Laes128__func1
+	je	.Laes192__func1
+
+	vbroadcasti32x4	-208(%r11),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	vaesenc	%ymm9,%ymm1,%ymm1
+	vaesenc	%ymm9,%ymm2,%ymm2
+	vaesenc	%ymm9,%ymm3,%ymm3
+
+	vbroadcasti32x4	-192(%r11),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	vaesenc	%ymm9,%ymm1,%ymm1
+	vaesenc	%ymm9,%ymm2,%ymm2
+	vaesenc	%ymm9,%ymm3,%ymm3
+
+.Laes192__func1:
+	vbroadcasti32x4	-176(%r11),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	vaesenc	%ymm9,%ymm1,%ymm1
+	vaesenc	%ymm9,%ymm2,%ymm2
+	vaesenc	%ymm9,%ymm3,%ymm3
+
+	vbroadcasti32x4	-160(%r11),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	vaesenc	%ymm9,%ymm1,%ymm1
+	vaesenc	%ymm9,%ymm2,%ymm2
+	vaesenc	%ymm9,%ymm3,%ymm3
+
+.Laes128__func1:
+	vpshufb	%ymm8,%ymm4,%ymm4
+	vpxord	%ymm10,%ymm4,%ymm4
+	vpshufb	%ymm8,%ymm5,%ymm5
+	vpshufb	%ymm8,%ymm6,%ymm6
+
+	vaesenc	%ymm15,%ymm0,%ymm0
+	vaesenc	%ymm15,%ymm1,%ymm1
+	vaesenc	%ymm15,%ymm2,%ymm2
+	vaesenc	%ymm15,%ymm3,%ymm3
+
+	vpshufb	%ymm8,%ymm7,%ymm7
+	vpclmulqdq	$0x00,%ymm27,%ymm4,%ymm10
+	vpclmulqdq	$0x00,%ymm28,%ymm5,%ymm24
+	vpclmulqdq	$0x00,%ymm29,%ymm6,%ymm25
+
+	vaesenc	%ymm16,%ymm0,%ymm0
+	vaesenc	%ymm16,%ymm1,%ymm1
+	vaesenc	%ymm16,%ymm2,%ymm2
+	vaesenc	%ymm16,%ymm3,%ymm3
+
+	vpxord	%ymm24,%ymm10,%ymm10
+	vpclmulqdq	$0x00,%ymm30,%ymm7,%ymm26
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm10
+	vpclmulqdq	$0x01,%ymm27,%ymm4,%ymm24
+
+	vaesenc	%ymm17,%ymm0,%ymm0
+	vaesenc	%ymm17,%ymm1,%ymm1
+	vaesenc	%ymm17,%ymm2,%ymm2
+	vaesenc	%ymm17,%ymm3,%ymm3
+
+	vpclmulqdq	$0x01,%ymm28,%ymm5,%ymm25
+	vpclmulqdq	$0x01,%ymm29,%ymm6,%ymm26
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm24
+	vpclmulqdq	$0x01,%ymm30,%ymm7,%ymm25
+
+	vaesenc	%ymm18,%ymm0,%ymm0
+	vaesenc	%ymm18,%ymm1,%ymm1
+	vaesenc	%ymm18,%ymm2,%ymm2
+	vaesenc	%ymm18,%ymm3,%ymm3
+
+	vpclmulqdq	$0x10,%ymm27,%ymm4,%ymm26
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm24
+	vpclmulqdq	$0x10,%ymm28,%ymm5,%ymm25
+	vpclmulqdq	$0x10,%ymm29,%ymm6,%ymm26
+
+	vaesenc	%ymm19,%ymm0,%ymm0
+	vaesenc	%ymm19,%ymm1,%ymm1
+	vaesenc	%ymm19,%ymm2,%ymm2
+	vaesenc	%ymm19,%ymm3,%ymm3
+
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm24
+	vpclmulqdq	$0x01,%ymm10,%ymm31,%ymm26
+	vpclmulqdq	$0x10,%ymm30,%ymm7,%ymm25
+	vpxord	%ymm25,%ymm24,%ymm24
+
+	vaesenc	%ymm20,%ymm0,%ymm0
+	vaesenc	%ymm20,%ymm1,%ymm1
+	vaesenc	%ymm20,%ymm2,%ymm2
+	vaesenc	%ymm20,%ymm3,%ymm3
+
+	vpshufd	$0x4e,%ymm10,%ymm10
+	vpclmulqdq	$0x11,%ymm27,%ymm4,%ymm4
+	vpclmulqdq	$0x11,%ymm28,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm29,%ymm6,%ymm6
+
+	vaesenc	%ymm21,%ymm0,%ymm0
+	vaesenc	%ymm21,%ymm1,%ymm1
+	vaesenc	%ymm21,%ymm2,%ymm2
+	vaesenc	%ymm21,%ymm3,%ymm3
+
+	vpternlogd	$0x96,%ymm26,%ymm10,%ymm24
+	vpclmulqdq	$0x11,%ymm30,%ymm7,%ymm7
+	vpternlogd	$0x96,%ymm6,%ymm5,%ymm4
+	vpclmulqdq	$0x01,%ymm24,%ymm31,%ymm25
+
+	vaesenc	%ymm22,%ymm0,%ymm0
+	vaesenc	%ymm22,%ymm1,%ymm1
+	vaesenc	%ymm22,%ymm2,%ymm2
+	vaesenc	%ymm22,%ymm3,%ymm3
+
+	vpxord	%ymm7,%ymm4,%ymm10
+	vpshufd	$0x4e,%ymm24,%ymm24
+	vpternlogd	$0x96,%ymm25,%ymm24,%ymm10
+
+	vaesenc	%ymm23,%ymm0,%ymm0
+	vaesenc	%ymm23,%ymm1,%ymm1
+	vaesenc	%ymm23,%ymm2,%ymm2
+	vaesenc	%ymm23,%ymm3,%ymm3
+
+	vextracti32x4	$1,%ymm10,%xmm4
+	vpxord	%xmm4,%xmm10,%xmm10
+
+
+
+
+	vpxord	0(%rdi),%ymm14,%ymm4
+	vpxord	32(%rdi),%ymm14,%ymm5
+	vpxord	64(%rdi),%ymm14,%ymm6
+	vpxord	96(%rdi),%ymm14,%ymm7
+
+
+
+	vaesenclast	%ymm4,%ymm0,%ymm4
+	vaesenclast	%ymm5,%ymm1,%ymm5
+	vaesenclast	%ymm6,%ymm2,%ymm6
+	vaesenclast	%ymm7,%ymm3,%ymm7
+
+
+	vmovdqu8	%ymm4,0(%rsi)
+	vmovdqu8	%ymm5,32(%rsi)
+	vmovdqu8	%ymm6,64(%rsi)
+	vmovdqu8	%ymm7,96(%rsi)
+
+	subq	$-128,%rdi
+	subq	$-128,%rsi
+	addq	$-128,%rdx
+	cmpq	$128-1,%rdx
+	ja	.Lcrypt_loop_4x__func1
+.Lghash_last_ciphertext_4x__func1:
+	vpshufb	%ymm8,%ymm4,%ymm4
+	vpxord	%ymm10,%ymm4,%ymm4
+	vpshufb	%ymm8,%ymm5,%ymm5
+	vpshufb	%ymm8,%ymm6,%ymm6
+	vpshufb	%ymm8,%ymm7,%ymm7
+	vpclmulqdq	$0x00,%ymm27,%ymm4,%ymm10
+	vpclmulqdq	$0x00,%ymm28,%ymm5,%ymm24
+	vpclmulqdq	$0x00,%ymm29,%ymm6,%ymm25
+	vpxord	%ymm24,%ymm10,%ymm10
+	vpclmulqdq	$0x00,%ymm30,%ymm7,%ymm26
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm10
+	vpclmulqdq	$0x01,%ymm27,%ymm4,%ymm24
+	vpclmulqdq	$0x01,%ymm28,%ymm5,%ymm25
+	vpclmulqdq	$0x01,%ymm29,%ymm6,%ymm26
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm24
+	vpclmulqdq	$0x01,%ymm30,%ymm7,%ymm25
+	vpclmulqdq	$0x10,%ymm27,%ymm4,%ymm26
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm24
+	vpclmulqdq	$0x10,%ymm28,%ymm5,%ymm25
+	vpclmulqdq	$0x10,%ymm29,%ymm6,%ymm26
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm24
+	vpclmulqdq	$0x01,%ymm10,%ymm31,%ymm26
+	vpclmulqdq	$0x10,%ymm30,%ymm7,%ymm25
+	vpxord	%ymm25,%ymm24,%ymm24
+	vpshufd	$0x4e,%ymm10,%ymm10
+	vpclmulqdq	$0x11,%ymm27,%ymm4,%ymm4
+	vpclmulqdq	$0x11,%ymm28,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm29,%ymm6,%ymm6
+	vpternlogd	$0x96,%ymm26,%ymm10,%ymm24
+	vpclmulqdq	$0x11,%ymm30,%ymm7,%ymm7
+	vpternlogd	$0x96,%ymm6,%ymm5,%ymm4
+	vpclmulqdq	$0x01,%ymm24,%ymm31,%ymm25
+	vpxord	%ymm7,%ymm4,%ymm10
+	vpshufd	$0x4e,%ymm24,%ymm24
+	vpternlogd	$0x96,%ymm25,%ymm24,%ymm10
+	vextracti32x4	$1,%ymm10,%xmm4
+	vpxord	%xmm4,%xmm10,%xmm10
+
+.Lcrypt_loop_4x_done__func1:
+
+	testq	%rdx,%rdx
+	jz	.Ldone__func1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	movq	%rdx,%rax
+	negq	%rax
+	andq	$-16,%rax
+	leaq	256(%r9,%rax,1),%r8
+	vpxor	%xmm4,%xmm4,%xmm4
+	vpxor	%xmm5,%xmm5,%xmm5
+	vpxor	%xmm6,%xmm6,%xmm6
+
+	cmpq	$32,%rdx
+	jb	.Lpartial_vec__func1
+
+.Lcrypt_loop_1x__func1:
+
+
+
+	vpshufb	%ymm8,%ymm12,%ymm0
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpxord	%ymm13,%ymm0,%ymm0
+	leaq	16(%rcx),%rax
+.Lvaesenc_loop_tail_full_vec__func1:
+	vbroadcasti32x4	(%rax),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	.Lvaesenc_loop_tail_full_vec__func1
+	vaesenclast	%ymm14,%ymm0,%ymm0
+
+
+	vmovdqu8	(%rdi),%ymm1
+	vpxord	%ymm1,%ymm0,%ymm0
+	vmovdqu8	%ymm0,(%rsi)
+
+
+	vmovdqu8	(%r8),%ymm30
+	vpshufb	%ymm8,%ymm0,%ymm0
+	vpxord	%ymm10,%ymm0,%ymm0
+	vpclmulqdq	$0x00,%ymm30,%ymm0,%ymm7
+	vpclmulqdq	$0x01,%ymm30,%ymm0,%ymm1
+	vpclmulqdq	$0x10,%ymm30,%ymm0,%ymm2
+	vpclmulqdq	$0x11,%ymm30,%ymm0,%ymm3
+	vpxord	%ymm7,%ymm4,%ymm4
+	vpternlogd	$0x96,%ymm2,%ymm1,%ymm5
+	vpxord	%ymm3,%ymm6,%ymm6
+
+	vpxor	%xmm10,%xmm10,%xmm10
+
+	addq	$32,%r8
+	addq	$32,%rdi
+	addq	$32,%rsi
+	subq	$32,%rdx
+	cmpq	$32,%rdx
+	jae	.Lcrypt_loop_1x__func1
+
+	testq	%rdx,%rdx
+	jz	.Lreduce__func1
+
+.Lpartial_vec__func1:
+
+
+
+
+	movq	$-1,%rax
+	bzhiq	%rdx,%rax,%rax
+	kmovd	%eax,%k1
+	addq	$15,%rdx
+	andq	$-16,%rdx
+	movq	$-1,%rax
+	bzhiq	%rdx,%rax,%rax
+	kmovd	%eax,%k2
+
+
+
+	vpshufb	%ymm8,%ymm12,%ymm0
+	vpxord	%ymm13,%ymm0,%ymm0
+	leaq	16(%rcx),%rax
+.Lvaesenc_loop_tail_partialvec__func1:
+	vbroadcasti32x4	(%rax),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	.Lvaesenc_loop_tail_partialvec__func1
+	vaesenclast	%ymm14,%ymm0,%ymm0
+
+
+	vmovdqu8	(%rdi),%ymm1{%k1}{z}
+	vpxord	%ymm1,%ymm0,%ymm0
+	vmovdqu8	%ymm0,(%rsi){%k1}
+
+
+
+
+
+
+
+
+
+
+
+
+
+	vmovdqu8	(%r8),%ymm30{%k2}{z}
+	vmovdqu8	%ymm0,%ymm1{%k1}{z}
+	vpshufb	%ymm8,%ymm1,%ymm0
+	vpxord	%ymm10,%ymm0,%ymm0
+	vpclmulqdq	$0x00,%ymm30,%ymm0,%ymm7
+	vpclmulqdq	$0x01,%ymm30,%ymm0,%ymm1
+	vpclmulqdq	$0x10,%ymm30,%ymm0,%ymm2
+	vpclmulqdq	$0x11,%ymm30,%ymm0,%ymm3
+	vpxord	%ymm7,%ymm4,%ymm4
+	vpternlogd	$0x96,%ymm2,%ymm1,%ymm5
+	vpxord	%ymm3,%ymm6,%ymm6
+
+
+.Lreduce__func1:
+
+	vpclmulqdq	$0x01,%ymm4,%ymm31,%ymm0
+	vpshufd	$0x4e,%ymm4,%ymm4
+	vpternlogd	$0x96,%ymm0,%ymm4,%ymm5
+	vpclmulqdq	$0x01,%ymm5,%ymm31,%ymm0
+	vpshufd	$0x4e,%ymm5,%ymm5
+	vpternlogd	$0x96,%ymm0,%ymm5,%ymm6
+
+	vextracti32x4	$1,%ymm6,%xmm0
+	vpxord	%xmm0,%xmm6,%xmm10
+
+
+.Ldone__func1:
+
+	vpshufb	%xmm8,%xmm10,%xmm10
+	vmovdqu	%xmm10,(%r12)
+
+	vzeroupper
+	popq	%r12
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r12
+	ret
+
+.cfi_endproc	
+.size	aes_gcm_enc_update_vaes_avx10_256, . - aes_gcm_enc_update_vaes_avx10_256
+.globl	aes_gcm_dec_update_vaes_avx10_256
+.hidden aes_gcm_dec_update_vaes_avx10_256
+.type	aes_gcm_dec_update_vaes_avx10_256,@function
+.align	32
+aes_gcm_dec_update_vaes_avx10_256:
+.cfi_startproc	
+
+_CET_ENDBR
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-16
+
+	movq	16(%rsp),%r12
+
+	vbroadcasti32x4	.Lbswap_mask(%rip),%ymm8
+	vbroadcasti32x4	.Lgfpoly(%rip),%ymm31
+
+
+
+	vmovdqu	(%r12),%xmm10
+	vpshufb	%xmm8,%xmm10,%xmm10
+	vbroadcasti32x4	(%r8),%ymm12
+	vpshufb	%ymm8,%ymm12,%ymm12
+
+
+
+	movl	240(%rcx),%r10d
+	leal	-20(,%r10,4),%r10d
+
+
+
+
+	leaq	96(%rcx,%r10,4),%r11
+	vbroadcasti32x4	(%rcx),%ymm13
+	vbroadcasti32x4	(%r11),%ymm14
+
+
+	vpaddd	.Lctr_pattern(%rip),%ymm12,%ymm12
+
+
+	vbroadcasti32x4	.Linc_2blocks(%rip),%ymm11
+
+
+
+	cmpq	$128-1,%rdx
+	jbe	.Lcrypt_loop_4x_done__func2
+
+
+	vmovdqu8	256-128(%r9),%ymm27
+	vmovdqu8	256-96(%r9),%ymm28
+	vmovdqu8	256-64(%r9),%ymm29
+	vmovdqu8	256-32(%r9),%ymm30
+	vbroadcasti32x4	-144(%r11),%ymm15
+	vbroadcasti32x4	-128(%r11),%ymm16
+	vbroadcasti32x4	-112(%r11),%ymm17
+	vbroadcasti32x4	-96(%r11),%ymm18
+	vbroadcasti32x4	-80(%r11),%ymm19
+	vbroadcasti32x4	-64(%r11),%ymm20
+	vbroadcasti32x4	-48(%r11),%ymm21
+	vbroadcasti32x4	-32(%r11),%ymm22
+	vbroadcasti32x4	-16(%r11),%ymm23
+.Lcrypt_loop_4x__func2:
+	vmovdqu8	0(%rdi),%ymm4
+	vmovdqu8	32(%rdi),%ymm5
+	vmovdqu8	64(%rdi),%ymm6
+	vmovdqu8	96(%rdi),%ymm7
+
+
+
+	vpshufb	%ymm8,%ymm12,%ymm0
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm12,%ymm1
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm12,%ymm2
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm12,%ymm3
+	vpaddd	%ymm11,%ymm12,%ymm12
+
+
+	vpxord	%ymm13,%ymm0,%ymm0
+	vpxord	%ymm13,%ymm1,%ymm1
+	vpxord	%ymm13,%ymm2,%ymm2
+	vpxord	%ymm13,%ymm3,%ymm3
+
+	cmpl	$24,%r10d
+	jl	.Laes128__func2
+	je	.Laes192__func2
+
+	vbroadcasti32x4	-208(%r11),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	vaesenc	%ymm9,%ymm1,%ymm1
+	vaesenc	%ymm9,%ymm2,%ymm2
+	vaesenc	%ymm9,%ymm3,%ymm3
+
+	vbroadcasti32x4	-192(%r11),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	vaesenc	%ymm9,%ymm1,%ymm1
+	vaesenc	%ymm9,%ymm2,%ymm2
+	vaesenc	%ymm9,%ymm3,%ymm3
+
+.Laes192__func2:
+	vbroadcasti32x4	-176(%r11),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	vaesenc	%ymm9,%ymm1,%ymm1
+	vaesenc	%ymm9,%ymm2,%ymm2
+	vaesenc	%ymm9,%ymm3,%ymm3
+
+	vbroadcasti32x4	-160(%r11),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	vaesenc	%ymm9,%ymm1,%ymm1
+	vaesenc	%ymm9,%ymm2,%ymm2
+	vaesenc	%ymm9,%ymm3,%ymm3
+
+.Laes128__func2:
+	vpshufb	%ymm8,%ymm4,%ymm4
+	vpxord	%ymm10,%ymm4,%ymm4
+	vpshufb	%ymm8,%ymm5,%ymm5
+	vpshufb	%ymm8,%ymm6,%ymm6
+
+	vaesenc	%ymm15,%ymm0,%ymm0
+	vaesenc	%ymm15,%ymm1,%ymm1
+	vaesenc	%ymm15,%ymm2,%ymm2
+	vaesenc	%ymm15,%ymm3,%ymm3
+
+	vpshufb	%ymm8,%ymm7,%ymm7
+	vpclmulqdq	$0x00,%ymm27,%ymm4,%ymm10
+	vpclmulqdq	$0x00,%ymm28,%ymm5,%ymm24
+	vpclmulqdq	$0x00,%ymm29,%ymm6,%ymm25
+
+	vaesenc	%ymm16,%ymm0,%ymm0
+	vaesenc	%ymm16,%ymm1,%ymm1
+	vaesenc	%ymm16,%ymm2,%ymm2
+	vaesenc	%ymm16,%ymm3,%ymm3
+
+	vpxord	%ymm24,%ymm10,%ymm10
+	vpclmulqdq	$0x00,%ymm30,%ymm7,%ymm26
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm10
+	vpclmulqdq	$0x01,%ymm27,%ymm4,%ymm24
+
+	vaesenc	%ymm17,%ymm0,%ymm0
+	vaesenc	%ymm17,%ymm1,%ymm1
+	vaesenc	%ymm17,%ymm2,%ymm2
+	vaesenc	%ymm17,%ymm3,%ymm3
+
+	vpclmulqdq	$0x01,%ymm28,%ymm5,%ymm25
+	vpclmulqdq	$0x01,%ymm29,%ymm6,%ymm26
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm24
+	vpclmulqdq	$0x01,%ymm30,%ymm7,%ymm25
+
+	vaesenc	%ymm18,%ymm0,%ymm0
+	vaesenc	%ymm18,%ymm1,%ymm1
+	vaesenc	%ymm18,%ymm2,%ymm2
+	vaesenc	%ymm18,%ymm3,%ymm3
+
+	vpclmulqdq	$0x10,%ymm27,%ymm4,%ymm26
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm24
+	vpclmulqdq	$0x10,%ymm28,%ymm5,%ymm25
+	vpclmulqdq	$0x10,%ymm29,%ymm6,%ymm26
+
+	vaesenc	%ymm19,%ymm0,%ymm0
+	vaesenc	%ymm19,%ymm1,%ymm1
+	vaesenc	%ymm19,%ymm2,%ymm2
+	vaesenc	%ymm19,%ymm3,%ymm3
+
+	vpternlogd	$0x96,%ymm26,%ymm25,%ymm24
+	vpclmulqdq	$0x01,%ymm10,%ymm31,%ymm26
+	vpclmulqdq	$0x10,%ymm30,%ymm7,%ymm25
+	vpxord	%ymm25,%ymm24,%ymm24
+
+	vaesenc	%ymm20,%ymm0,%ymm0
+	vaesenc	%ymm20,%ymm1,%ymm1
+	vaesenc	%ymm20,%ymm2,%ymm2
+	vaesenc	%ymm20,%ymm3,%ymm3
+
+	vpshufd	$0x4e,%ymm10,%ymm10
+	vpclmulqdq	$0x11,%ymm27,%ymm4,%ymm4
+	vpclmulqdq	$0x11,%ymm28,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm29,%ymm6,%ymm6
+
+	vaesenc	%ymm21,%ymm0,%ymm0
+	vaesenc	%ymm21,%ymm1,%ymm1
+	vaesenc	%ymm21,%ymm2,%ymm2
+	vaesenc	%ymm21,%ymm3,%ymm3
+
+	vpternlogd	$0x96,%ymm26,%ymm10,%ymm24
+	vpclmulqdq	$0x11,%ymm30,%ymm7,%ymm7
+	vpternlogd	$0x96,%ymm6,%ymm5,%ymm4
+	vpclmulqdq	$0x01,%ymm24,%ymm31,%ymm25
+
+	vaesenc	%ymm22,%ymm0,%ymm0
+	vaesenc	%ymm22,%ymm1,%ymm1
+	vaesenc	%ymm22,%ymm2,%ymm2
+	vaesenc	%ymm22,%ymm3,%ymm3
+
+	vpxord	%ymm7,%ymm4,%ymm10
+	vpshufd	$0x4e,%ymm24,%ymm24
+	vpternlogd	$0x96,%ymm25,%ymm24,%ymm10
+
+	vaesenc	%ymm23,%ymm0,%ymm0
+	vaesenc	%ymm23,%ymm1,%ymm1
+	vaesenc	%ymm23,%ymm2,%ymm2
+	vaesenc	%ymm23,%ymm3,%ymm3
+
+	vextracti32x4	$1,%ymm10,%xmm4
+	vpxord	%xmm4,%xmm10,%xmm10
+
+
+
+
+	vpxord	0(%rdi),%ymm14,%ymm4
+	vpxord	32(%rdi),%ymm14,%ymm5
+	vpxord	64(%rdi),%ymm14,%ymm6
+	vpxord	96(%rdi),%ymm14,%ymm7
+
+
+
+	vaesenclast	%ymm4,%ymm0,%ymm4
+	vaesenclast	%ymm5,%ymm1,%ymm5
+	vaesenclast	%ymm6,%ymm2,%ymm6
+	vaesenclast	%ymm7,%ymm3,%ymm7
+
+
+	vmovdqu8	%ymm4,0(%rsi)
+	vmovdqu8	%ymm5,32(%rsi)
+	vmovdqu8	%ymm6,64(%rsi)
+	vmovdqu8	%ymm7,96(%rsi)
+
+	subq	$-128,%rdi
+	subq	$-128,%rsi
+	addq	$-128,%rdx
+	cmpq	$128-1,%rdx
+	ja	.Lcrypt_loop_4x__func2
+.Lcrypt_loop_4x_done__func2:
+
+	testq	%rdx,%rdx
+	jz	.Ldone__func2
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	movq	%rdx,%rax
+	negq	%rax
+	andq	$-16,%rax
+	leaq	256(%r9,%rax,1),%r8
+	vpxor	%xmm4,%xmm4,%xmm4
+	vpxor	%xmm5,%xmm5,%xmm5
+	vpxor	%xmm6,%xmm6,%xmm6
+
+	cmpq	$32,%rdx
+	jb	.Lpartial_vec__func2
+
+.Lcrypt_loop_1x__func2:
+
+
+
+	vpshufb	%ymm8,%ymm12,%ymm0
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpxord	%ymm13,%ymm0,%ymm0
+	leaq	16(%rcx),%rax
+.Lvaesenc_loop_tail_full_vec__func2:
+	vbroadcasti32x4	(%rax),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	.Lvaesenc_loop_tail_full_vec__func2
+	vaesenclast	%ymm14,%ymm0,%ymm0
+
+
+	vmovdqu8	(%rdi),%ymm1
+	vpxord	%ymm1,%ymm0,%ymm0
+	vmovdqu8	%ymm0,(%rsi)
+
+
+	vmovdqu8	(%r8),%ymm30
+	vpshufb	%ymm8,%ymm1,%ymm0
+	vpxord	%ymm10,%ymm0,%ymm0
+	vpclmulqdq	$0x00,%ymm30,%ymm0,%ymm7
+	vpclmulqdq	$0x01,%ymm30,%ymm0,%ymm1
+	vpclmulqdq	$0x10,%ymm30,%ymm0,%ymm2
+	vpclmulqdq	$0x11,%ymm30,%ymm0,%ymm3
+	vpxord	%ymm7,%ymm4,%ymm4
+	vpternlogd	$0x96,%ymm2,%ymm1,%ymm5
+	vpxord	%ymm3,%ymm6,%ymm6
+
+	vpxor	%xmm10,%xmm10,%xmm10
+
+	addq	$32,%r8
+	addq	$32,%rdi
+	addq	$32,%rsi
+	subq	$32,%rdx
+	cmpq	$32,%rdx
+	jae	.Lcrypt_loop_1x__func2
+
+	testq	%rdx,%rdx
+	jz	.Lreduce__func2
+
+.Lpartial_vec__func2:
+
+
+
+
+	movq	$-1,%rax
+	bzhiq	%rdx,%rax,%rax
+	kmovd	%eax,%k1
+	addq	$15,%rdx
+	andq	$-16,%rdx
+	movq	$-1,%rax
+	bzhiq	%rdx,%rax,%rax
+	kmovd	%eax,%k2
+
+
+
+	vpshufb	%ymm8,%ymm12,%ymm0
+	vpxord	%ymm13,%ymm0,%ymm0
+	leaq	16(%rcx),%rax
+.Lvaesenc_loop_tail_partialvec__func2:
+	vbroadcasti32x4	(%rax),%ymm9
+	vaesenc	%ymm9,%ymm0,%ymm0
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	.Lvaesenc_loop_tail_partialvec__func2
+	vaesenclast	%ymm14,%ymm0,%ymm0
+
+
+	vmovdqu8	(%rdi),%ymm1{%k1}{z}
+	vpxord	%ymm1,%ymm0,%ymm0
+	vmovdqu8	%ymm0,(%rsi){%k1}
+
+
+
+
+
+
+
+
+
+
+
+
+
+	vmovdqu8	(%r8),%ymm30{%k2}{z}
+
+	vpshufb	%ymm8,%ymm1,%ymm0
+	vpxord	%ymm10,%ymm0,%ymm0
+	vpclmulqdq	$0x00,%ymm30,%ymm0,%ymm7
+	vpclmulqdq	$0x01,%ymm30,%ymm0,%ymm1
+	vpclmulqdq	$0x10,%ymm30,%ymm0,%ymm2
+	vpclmulqdq	$0x11,%ymm30,%ymm0,%ymm3
+	vpxord	%ymm7,%ymm4,%ymm4
+	vpternlogd	$0x96,%ymm2,%ymm1,%ymm5
+	vpxord	%ymm3,%ymm6,%ymm6
+
+
+.Lreduce__func2:
+
+	vpclmulqdq	$0x01,%ymm4,%ymm31,%ymm0
+	vpshufd	$0x4e,%ymm4,%ymm4
+	vpternlogd	$0x96,%ymm0,%ymm4,%ymm5
+	vpclmulqdq	$0x01,%ymm5,%ymm31,%ymm0
+	vpshufd	$0x4e,%ymm5,%ymm5
+	vpternlogd	$0x96,%ymm0,%ymm5,%ymm6
+
+	vextracti32x4	$1,%ymm6,%xmm0
+	vpxord	%xmm0,%xmm6,%xmm10
+
+
+.Ldone__func2:
+
+	vpshufb	%xmm8,%xmm10,%xmm10
+	vmovdqu	%xmm10,(%r12)
+
+	vzeroupper
+	popq	%r12
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r12
+	ret
+
+.cfi_endproc	
+.size	aes_gcm_dec_update_vaes_avx10_256, . - aes_gcm_dec_update_vaes_avx10_256
+.globl	gcm_ghash_vpclmulqdq_avx10_512
+.hidden gcm_ghash_vpclmulqdq_avx10_512
+.type	gcm_ghash_vpclmulqdq_avx10_512,@function
+.align	32
+gcm_ghash_vpclmulqdq_avx10_512:
+.cfi_startproc	
+
+_CET_ENDBR
+
+
+
+
+
+
+	vmovdqu	.Lbswap_mask(%rip),%xmm4
+	vmovdqu	.Lgfpoly(%rip),%xmm10
+
+
+	vmovdqu	(%rdi),%xmm5
+	vpshufb	%xmm4,%xmm5,%xmm5
+
+
+	cmpq	$64,%rcx
+	jb	.Laad_blockbyblock__func2
+
+
+
+	vshufi64x2	$0,%zmm4,%zmm4,%zmm4
+	vshufi64x2	$0,%zmm10,%zmm10,%zmm10
+
+
+	vmovdqu8	256-64(%rsi),%zmm9
+
+	cmpq	$256-1,%rcx
+	jbe	.Laad_loop_1x__func2
+
+
+	vmovdqu8	256-256(%rsi),%zmm6
+	vmovdqu8	256-192(%rsi),%zmm7
+	vmovdqu8	256-128(%rsi),%zmm8
+
+
+.Laad_loop_4x__func2:
+	vmovdqu8	0(%rdx),%zmm0
+	vmovdqu8	64(%rdx),%zmm1
+	vmovdqu8	128(%rdx),%zmm2
+	vmovdqu8	192(%rdx),%zmm3
+	vpshufb	%zmm4,%zmm0,%zmm0
+	vpxord	%zmm5,%zmm0,%zmm0
+	vpshufb	%zmm4,%zmm1,%zmm1
+	vpshufb	%zmm4,%zmm2,%zmm2
+	vpshufb	%zmm4,%zmm3,%zmm3
+	vpclmulqdq	$0x00,%zmm6,%zmm0,%zmm5
+	vpclmulqdq	$0x00,%zmm7,%zmm1,%zmm11
+	vpclmulqdq	$0x00,%zmm8,%zmm2,%zmm12
+	vpxord	%zmm11,%zmm5,%zmm5
+	vpclmulqdq	$0x00,%zmm9,%zmm3,%zmm13
+	vpternlogd	$0x96,%zmm13,%zmm12,%zmm5
+	vpclmulqdq	$0x01,%zmm6,%zmm0,%zmm11
+	vpclmulqdq	$0x01,%zmm7,%zmm1,%zmm12
+	vpclmulqdq	$0x01,%zmm8,%zmm2,%zmm13
+	vpternlogd	$0x96,%zmm13,%zmm12,%zmm11
+	vpclmulqdq	$0x01,%zmm9,%zmm3,%zmm12
+	vpclmulqdq	$0x10,%zmm6,%zmm0,%zmm13
+	vpternlogd	$0x96,%zmm13,%zmm12,%zmm11
+	vpclmulqdq	$0x10,%zmm7,%zmm1,%zmm12
+	vpclmulqdq	$0x10,%zmm8,%zmm2,%zmm13
+	vpternlogd	$0x96,%zmm13,%zmm12,%zmm11
+	vpclmulqdq	$0x01,%zmm5,%zmm10,%zmm13
+	vpclmulqdq	$0x10,%zmm9,%zmm3,%zmm12
+	vpxord	%zmm12,%zmm11,%zmm11
+	vpshufd	$0x4e,%zmm5,%zmm5
+	vpclmulqdq	$0x11,%zmm6,%zmm0,%zmm0
+	vpclmulqdq	$0x11,%zmm7,%zmm1,%zmm1
+	vpclmulqdq	$0x11,%zmm8,%zmm2,%zmm2
+	vpternlogd	$0x96,%zmm13,%zmm5,%zmm11
+	vpclmulqdq	$0x11,%zmm9,%zmm3,%zmm3
+	vpternlogd	$0x96,%zmm2,%zmm1,%zmm0
+	vpclmulqdq	$0x01,%zmm11,%zmm10,%zmm12
+	vpxord	%zmm3,%zmm0,%zmm5
+	vpshufd	$0x4e,%zmm11,%zmm11
+	vpternlogd	$0x96,%zmm12,%zmm11,%zmm5
+	vextracti32x4	$1,%zmm5,%xmm0
+	vextracti32x4	$2,%zmm5,%xmm1
+	vextracti32x4	$3,%zmm5,%xmm2
+	vpxord	%xmm0,%xmm5,%xmm5
+	vpternlogd	$0x96,%xmm1,%xmm2,%xmm5
+
+	subq	$-256,%rdx
+	addq	$-256,%rcx
+	cmpq	$256-1,%rcx
+	ja	.Laad_loop_4x__func2
+
+
+	cmpq	$64,%rcx
+	jb	.Laad_large_done__func2
+.Laad_loop_1x__func2:
+	vmovdqu8	(%rdx),%zmm0
+	vpshufb	%zmm4,%zmm0,%zmm0
+	vpxord	%zmm0,%zmm5,%zmm5
+	vpclmulqdq	$0x00,%zmm9,%zmm5,%zmm0
+	vpclmulqdq	$0x01,%zmm9,%zmm5,%zmm1
+	vpclmulqdq	$0x10,%zmm9,%zmm5,%zmm2
+	vpxord	%zmm2,%zmm1,%zmm1
+	vpclmulqdq	$0x01,%zmm0,%zmm10,%zmm2
+	vpshufd	$0x4e,%zmm0,%zmm0
+	vpternlogd	$0x96,%zmm2,%zmm0,%zmm1
+	vpclmulqdq	$0x11,%zmm9,%zmm5,%zmm5
+	vpclmulqdq	$0x01,%zmm1,%zmm10,%zmm0
+	vpshufd	$0x4e,%zmm1,%zmm1
+	vpternlogd	$0x96,%zmm0,%zmm1,%zmm5
+
+	vextracti32x4	$1,%zmm5,%xmm0
+	vextracti32x4	$2,%zmm5,%xmm1
+	vextracti32x4	$3,%zmm5,%xmm2
+	vpxord	%xmm0,%xmm5,%xmm5
+	vpternlogd	$0x96,%xmm1,%xmm2,%xmm5
+
+	addq	$64,%rdx
+	subq	$64,%rcx
+	cmpq	$64,%rcx
+	jae	.Laad_loop_1x__func2
+
+.Laad_large_done__func2:
+
+
+	vzeroupper
+
+
+.Laad_blockbyblock__func2:
+	testq	%rcx,%rcx
+	jz	.Laad_done__func2
+	vmovdqu	256-16(%rsi),%xmm9
+.Laad_loop_blockbyblock__func2:
+	vmovdqu	(%rdx),%xmm0
+	vpshufb	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm9,%xmm5,%xmm0
+	vpclmulqdq	$0x01,%xmm9,%xmm5,%xmm1
+	vpclmulqdq	$0x10,%xmm9,%xmm5,%xmm2
+	vpxord	%xmm2,%xmm1,%xmm1
+	vpclmulqdq	$0x01,%xmm0,%xmm10,%xmm2
+	vpshufd	$0x4e,%xmm0,%xmm0
+	vpternlogd	$0x96,%xmm2,%xmm0,%xmm1
+	vpclmulqdq	$0x11,%xmm9,%xmm5,%xmm5
+	vpclmulqdq	$0x01,%xmm1,%xmm10,%xmm0
+	vpshufd	$0x4e,%xmm1,%xmm1
+	vpternlogd	$0x96,%xmm0,%xmm1,%xmm5
+
+	addq	$16,%rdx
+	subq	$16,%rcx
+	jnz	.Laad_loop_blockbyblock__func2
+
+.Laad_done__func2:
+
+	vpshufb	%xmm4,%xmm5,%xmm5
+	vmovdqu	%xmm5,(%rdi)
+	ret
+
+.cfi_endproc	
+.size	gcm_ghash_vpclmulqdq_avx10_512, . - gcm_ghash_vpclmulqdq_avx10_512
+.globl	aes_gcm_enc_update_vaes_avx10_512
+.hidden aes_gcm_enc_update_vaes_avx10_512
+.type	aes_gcm_enc_update_vaes_avx10_512,@function
+.align	32
+aes_gcm_enc_update_vaes_avx10_512:
+.cfi_startproc	
+
+_CET_ENDBR
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-16
+
+	movq	16(%rsp),%r12
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern	BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+	movb	$1,BORINGSSL_function_hit+7(%rip)
+#endif
+
+	vbroadcasti32x4	.Lbswap_mask(%rip),%zmm8
+	vbroadcasti32x4	.Lgfpoly(%rip),%zmm31
+
+
+
+	vmovdqu	(%r12),%xmm10
+	vpshufb	%xmm8,%xmm10,%xmm10
+	vbroadcasti32x4	(%r8),%zmm12
+	vpshufb	%zmm8,%zmm12,%zmm12
+
+
+
+	movl	240(%rcx),%r10d
+	leal	-20(,%r10,4),%r10d
+
+
+
+
+	leaq	96(%rcx,%r10,4),%r11
+	vbroadcasti32x4	(%rcx),%zmm13
+	vbroadcasti32x4	(%r11),%zmm14
+
+
+	vpaddd	.Lctr_pattern(%rip),%zmm12,%zmm12
+
+
+	vbroadcasti32x4	.Linc_4blocks(%rip),%zmm11
+
+
+
+	cmpq	$256-1,%rdx
+	jbe	.Lcrypt_loop_4x_done__func3
+
+
+	vmovdqu8	256-256(%r9),%zmm27
+	vmovdqu8	256-192(%r9),%zmm28
+	vmovdqu8	256-128(%r9),%zmm29
+	vmovdqu8	256-64(%r9),%zmm30
+
+
+
+
+	vpshufb	%zmm8,%zmm12,%zmm0
+	vpaddd	%zmm11,%zmm12,%zmm12
+	vpshufb	%zmm8,%zmm12,%zmm1
+	vpaddd	%zmm11,%zmm12,%zmm12
+	vpshufb	%zmm8,%zmm12,%zmm2
+	vpaddd	%zmm11,%zmm12,%zmm12
+	vpshufb	%zmm8,%zmm12,%zmm3
+	vpaddd	%zmm11,%zmm12,%zmm12
+
+
+	vpxord	%zmm13,%zmm0,%zmm0
+	vpxord	%zmm13,%zmm1,%zmm1
+	vpxord	%zmm13,%zmm2,%zmm2
+	vpxord	%zmm13,%zmm3,%zmm3
+
+	leaq	16(%rcx),%rax
+.Lvaesenc_loop_first_4_vecs__func3:
+	vbroadcasti32x4	(%rax),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	vaesenc	%zmm9,%zmm1,%zmm1
+	vaesenc	%zmm9,%zmm2,%zmm2
+	vaesenc	%zmm9,%zmm3,%zmm3
+
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	.Lvaesenc_loop_first_4_vecs__func3
+
+
+
+	vpxord	0(%rdi),%zmm14,%zmm4
+	vpxord	64(%rdi),%zmm14,%zmm5
+	vpxord	128(%rdi),%zmm14,%zmm6
+	vpxord	192(%rdi),%zmm14,%zmm7
+
+
+
+	vaesenclast	%zmm4,%zmm0,%zmm4
+	vaesenclast	%zmm5,%zmm1,%zmm5
+	vaesenclast	%zmm6,%zmm2,%zmm6
+	vaesenclast	%zmm7,%zmm3,%zmm7
+
+
+	vmovdqu8	%zmm4,0(%rsi)
+	vmovdqu8	%zmm5,64(%rsi)
+	vmovdqu8	%zmm6,128(%rsi)
+	vmovdqu8	%zmm7,192(%rsi)
+
+	subq	$-256,%rdi
+	subq	$-256,%rsi
+	addq	$-256,%rdx
+	cmpq	$256-1,%rdx
+	jbe	.Lghash_last_ciphertext_4x__func3
+	vbroadcasti32x4	-144(%r11),%zmm15
+	vbroadcasti32x4	-128(%r11),%zmm16
+	vbroadcasti32x4	-112(%r11),%zmm17
+	vbroadcasti32x4	-96(%r11),%zmm18
+	vbroadcasti32x4	-80(%r11),%zmm19
+	vbroadcasti32x4	-64(%r11),%zmm20
+	vbroadcasti32x4	-48(%r11),%zmm21
+	vbroadcasti32x4	-32(%r11),%zmm22
+	vbroadcasti32x4	-16(%r11),%zmm23
+.Lcrypt_loop_4x__func3:
+
+
+
+	vpshufb	%zmm8,%zmm12,%zmm0
+	vpaddd	%zmm11,%zmm12,%zmm12
+	vpshufb	%zmm8,%zmm12,%zmm1
+	vpaddd	%zmm11,%zmm12,%zmm12
+	vpshufb	%zmm8,%zmm12,%zmm2
+	vpaddd	%zmm11,%zmm12,%zmm12
+	vpshufb	%zmm8,%zmm12,%zmm3
+	vpaddd	%zmm11,%zmm12,%zmm12
+
+
+	vpxord	%zmm13,%zmm0,%zmm0
+	vpxord	%zmm13,%zmm1,%zmm1
+	vpxord	%zmm13,%zmm2,%zmm2
+	vpxord	%zmm13,%zmm3,%zmm3
+
+	cmpl	$24,%r10d
+	jl	.Laes128__func3
+	je	.Laes192__func3
+
+	vbroadcasti32x4	-208(%r11),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	vaesenc	%zmm9,%zmm1,%zmm1
+	vaesenc	%zmm9,%zmm2,%zmm2
+	vaesenc	%zmm9,%zmm3,%zmm3
+
+	vbroadcasti32x4	-192(%r11),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	vaesenc	%zmm9,%zmm1,%zmm1
+	vaesenc	%zmm9,%zmm2,%zmm2
+	vaesenc	%zmm9,%zmm3,%zmm3
+
+.Laes192__func3:
+	vbroadcasti32x4	-176(%r11),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	vaesenc	%zmm9,%zmm1,%zmm1
+	vaesenc	%zmm9,%zmm2,%zmm2
+	vaesenc	%zmm9,%zmm3,%zmm3
+
+	vbroadcasti32x4	-160(%r11),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	vaesenc	%zmm9,%zmm1,%zmm1
+	vaesenc	%zmm9,%zmm2,%zmm2
+	vaesenc	%zmm9,%zmm3,%zmm3
+
+.Laes128__func3:
+	vpshufb	%zmm8,%zmm4,%zmm4
+	vpxord	%zmm10,%zmm4,%zmm4
+	vpshufb	%zmm8,%zmm5,%zmm5
+	vpshufb	%zmm8,%zmm6,%zmm6
+
+	vaesenc	%zmm15,%zmm0,%zmm0
+	vaesenc	%zmm15,%zmm1,%zmm1
+	vaesenc	%zmm15,%zmm2,%zmm2
+	vaesenc	%zmm15,%zmm3,%zmm3
+
+	vpshufb	%zmm8,%zmm7,%zmm7
+	vpclmulqdq	$0x00,%zmm27,%zmm4,%zmm10
+	vpclmulqdq	$0x00,%zmm28,%zmm5,%zmm24
+	vpclmulqdq	$0x00,%zmm29,%zmm6,%zmm25
+
+	vaesenc	%zmm16,%zmm0,%zmm0
+	vaesenc	%zmm16,%zmm1,%zmm1
+	vaesenc	%zmm16,%zmm2,%zmm2
+	vaesenc	%zmm16,%zmm3,%zmm3
+
+	vpxord	%zmm24,%zmm10,%zmm10
+	vpclmulqdq	$0x00,%zmm30,%zmm7,%zmm26
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm10
+	vpclmulqdq	$0x01,%zmm27,%zmm4,%zmm24
+
+	vaesenc	%zmm17,%zmm0,%zmm0
+	vaesenc	%zmm17,%zmm1,%zmm1
+	vaesenc	%zmm17,%zmm2,%zmm2
+	vaesenc	%zmm17,%zmm3,%zmm3
+
+	vpclmulqdq	$0x01,%zmm28,%zmm5,%zmm25
+	vpclmulqdq	$0x01,%zmm29,%zmm6,%zmm26
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm24
+	vpclmulqdq	$0x01,%zmm30,%zmm7,%zmm25
+
+	vaesenc	%zmm18,%zmm0,%zmm0
+	vaesenc	%zmm18,%zmm1,%zmm1
+	vaesenc	%zmm18,%zmm2,%zmm2
+	vaesenc	%zmm18,%zmm3,%zmm3
+
+	vpclmulqdq	$0x10,%zmm27,%zmm4,%zmm26
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm24
+	vpclmulqdq	$0x10,%zmm28,%zmm5,%zmm25
+	vpclmulqdq	$0x10,%zmm29,%zmm6,%zmm26
+
+	vaesenc	%zmm19,%zmm0,%zmm0
+	vaesenc	%zmm19,%zmm1,%zmm1
+	vaesenc	%zmm19,%zmm2,%zmm2
+	vaesenc	%zmm19,%zmm3,%zmm3
+
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm24
+	vpclmulqdq	$0x01,%zmm10,%zmm31,%zmm26
+	vpclmulqdq	$0x10,%zmm30,%zmm7,%zmm25
+	vpxord	%zmm25,%zmm24,%zmm24
+
+	vaesenc	%zmm20,%zmm0,%zmm0
+	vaesenc	%zmm20,%zmm1,%zmm1
+	vaesenc	%zmm20,%zmm2,%zmm2
+	vaesenc	%zmm20,%zmm3,%zmm3
+
+	vpshufd	$0x4e,%zmm10,%zmm10
+	vpclmulqdq	$0x11,%zmm27,%zmm4,%zmm4
+	vpclmulqdq	$0x11,%zmm28,%zmm5,%zmm5
+	vpclmulqdq	$0x11,%zmm29,%zmm6,%zmm6
+
+	vaesenc	%zmm21,%zmm0,%zmm0
+	vaesenc	%zmm21,%zmm1,%zmm1
+	vaesenc	%zmm21,%zmm2,%zmm2
+	vaesenc	%zmm21,%zmm3,%zmm3
+
+	vpternlogd	$0x96,%zmm26,%zmm10,%zmm24
+	vpclmulqdq	$0x11,%zmm30,%zmm7,%zmm7
+	vpternlogd	$0x96,%zmm6,%zmm5,%zmm4
+	vpclmulqdq	$0x01,%zmm24,%zmm31,%zmm25
+
+	vaesenc	%zmm22,%zmm0,%zmm0
+	vaesenc	%zmm22,%zmm1,%zmm1
+	vaesenc	%zmm22,%zmm2,%zmm2
+	vaesenc	%zmm22,%zmm3,%zmm3
+
+	vpxord	%zmm7,%zmm4,%zmm10
+	vpshufd	$0x4e,%zmm24,%zmm24
+	vpternlogd	$0x96,%zmm25,%zmm24,%zmm10
+
+	vaesenc	%zmm23,%zmm0,%zmm0
+	vaesenc	%zmm23,%zmm1,%zmm1
+	vaesenc	%zmm23,%zmm2,%zmm2
+	vaesenc	%zmm23,%zmm3,%zmm3
+
+	vextracti32x4	$1,%zmm10,%xmm4
+	vextracti32x4	$2,%zmm10,%xmm5
+	vextracti32x4	$3,%zmm10,%xmm6
+	vpxord	%xmm4,%xmm10,%xmm10
+	vpternlogd	$0x96,%xmm5,%xmm6,%xmm10
+
+
+
+
+	vpxord	0(%rdi),%zmm14,%zmm4
+	vpxord	64(%rdi),%zmm14,%zmm5
+	vpxord	128(%rdi),%zmm14,%zmm6
+	vpxord	192(%rdi),%zmm14,%zmm7
+
+
+
+	vaesenclast	%zmm4,%zmm0,%zmm4
+	vaesenclast	%zmm5,%zmm1,%zmm5
+	vaesenclast	%zmm6,%zmm2,%zmm6
+	vaesenclast	%zmm7,%zmm3,%zmm7
+
+
+	vmovdqu8	%zmm4,0(%rsi)
+	vmovdqu8	%zmm5,64(%rsi)
+	vmovdqu8	%zmm6,128(%rsi)
+	vmovdqu8	%zmm7,192(%rsi)
+
+	subq	$-256,%rdi
+	subq	$-256,%rsi
+	addq	$-256,%rdx
+	cmpq	$256-1,%rdx
+	ja	.Lcrypt_loop_4x__func3
+.Lghash_last_ciphertext_4x__func3:
+	vpshufb	%zmm8,%zmm4,%zmm4
+	vpxord	%zmm10,%zmm4,%zmm4
+	vpshufb	%zmm8,%zmm5,%zmm5
+	vpshufb	%zmm8,%zmm6,%zmm6
+	vpshufb	%zmm8,%zmm7,%zmm7
+	vpclmulqdq	$0x00,%zmm27,%zmm4,%zmm10
+	vpclmulqdq	$0x00,%zmm28,%zmm5,%zmm24
+	vpclmulqdq	$0x00,%zmm29,%zmm6,%zmm25
+	vpxord	%zmm24,%zmm10,%zmm10
+	vpclmulqdq	$0x00,%zmm30,%zmm7,%zmm26
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm10
+	vpclmulqdq	$0x01,%zmm27,%zmm4,%zmm24
+	vpclmulqdq	$0x01,%zmm28,%zmm5,%zmm25
+	vpclmulqdq	$0x01,%zmm29,%zmm6,%zmm26
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm24
+	vpclmulqdq	$0x01,%zmm30,%zmm7,%zmm25
+	vpclmulqdq	$0x10,%zmm27,%zmm4,%zmm26
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm24
+	vpclmulqdq	$0x10,%zmm28,%zmm5,%zmm25
+	vpclmulqdq	$0x10,%zmm29,%zmm6,%zmm26
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm24
+	vpclmulqdq	$0x01,%zmm10,%zmm31,%zmm26
+	vpclmulqdq	$0x10,%zmm30,%zmm7,%zmm25
+	vpxord	%zmm25,%zmm24,%zmm24
+	vpshufd	$0x4e,%zmm10,%zmm10
+	vpclmulqdq	$0x11,%zmm27,%zmm4,%zmm4
+	vpclmulqdq	$0x11,%zmm28,%zmm5,%zmm5
+	vpclmulqdq	$0x11,%zmm29,%zmm6,%zmm6
+	vpternlogd	$0x96,%zmm26,%zmm10,%zmm24
+	vpclmulqdq	$0x11,%zmm30,%zmm7,%zmm7
+	vpternlogd	$0x96,%zmm6,%zmm5,%zmm4
+	vpclmulqdq	$0x01,%zmm24,%zmm31,%zmm25
+	vpxord	%zmm7,%zmm4,%zmm10
+	vpshufd	$0x4e,%zmm24,%zmm24
+	vpternlogd	$0x96,%zmm25,%zmm24,%zmm10
+	vextracti32x4	$1,%zmm10,%xmm4
+	vextracti32x4	$2,%zmm10,%xmm5
+	vextracti32x4	$3,%zmm10,%xmm6
+	vpxord	%xmm4,%xmm10,%xmm10
+	vpternlogd	$0x96,%xmm5,%xmm6,%xmm10
+
+.Lcrypt_loop_4x_done__func3:
+
+	testq	%rdx,%rdx
+	jz	.Ldone__func3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	movq	%rdx,%rax
+	negq	%rax
+	andq	$-16,%rax
+	leaq	256(%r9,%rax,1),%r8
+	vpxor	%xmm4,%xmm4,%xmm4
+	vpxor	%xmm5,%xmm5,%xmm5
+	vpxor	%xmm6,%xmm6,%xmm6
+
+	cmpq	$64,%rdx
+	jb	.Lpartial_vec__func3
+
+.Lcrypt_loop_1x__func3:
+
+
+
+	vpshufb	%zmm8,%zmm12,%zmm0
+	vpaddd	%zmm11,%zmm12,%zmm12
+	vpxord	%zmm13,%zmm0,%zmm0
+	leaq	16(%rcx),%rax
+.Lvaesenc_loop_tail_full_vec__func3:
+	vbroadcasti32x4	(%rax),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	.Lvaesenc_loop_tail_full_vec__func3
+	vaesenclast	%zmm14,%zmm0,%zmm0
+
+
+	vmovdqu8	(%rdi),%zmm1
+	vpxord	%zmm1,%zmm0,%zmm0
+	vmovdqu8	%zmm0,(%rsi)
+
+
+	vmovdqu8	(%r8),%zmm30
+	vpshufb	%zmm8,%zmm0,%zmm0
+	vpxord	%zmm10,%zmm0,%zmm0
+	vpclmulqdq	$0x00,%zmm30,%zmm0,%zmm7
+	vpclmulqdq	$0x01,%zmm30,%zmm0,%zmm1
+	vpclmulqdq	$0x10,%zmm30,%zmm0,%zmm2
+	vpclmulqdq	$0x11,%zmm30,%zmm0,%zmm3
+	vpxord	%zmm7,%zmm4,%zmm4
+	vpternlogd	$0x96,%zmm2,%zmm1,%zmm5
+	vpxord	%zmm3,%zmm6,%zmm6
+
+	vpxor	%xmm10,%xmm10,%xmm10
+
+	addq	$64,%r8
+	addq	$64,%rdi
+	addq	$64,%rsi
+	subq	$64,%rdx
+	cmpq	$64,%rdx
+	jae	.Lcrypt_loop_1x__func3
+
+	testq	%rdx,%rdx
+	jz	.Lreduce__func3
+
+.Lpartial_vec__func3:
+
+
+
+
+	movq	$-1,%rax
+	bzhiq	%rdx,%rax,%rax
+	kmovq	%rax,%k1
+	addq	$15,%rdx
+	andq	$-16,%rdx
+	movq	$-1,%rax
+	bzhiq	%rdx,%rax,%rax
+	kmovq	%rax,%k2
+
+
+
+	vpshufb	%zmm8,%zmm12,%zmm0
+	vpxord	%zmm13,%zmm0,%zmm0
+	leaq	16(%rcx),%rax
+.Lvaesenc_loop_tail_partialvec__func3:
+	vbroadcasti32x4	(%rax),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	.Lvaesenc_loop_tail_partialvec__func3
+	vaesenclast	%zmm14,%zmm0,%zmm0
+
+
+	vmovdqu8	(%rdi),%zmm1{%k1}{z}
+	vpxord	%zmm1,%zmm0,%zmm0
+	vmovdqu8	%zmm0,(%rsi){%k1}
+
+
+
+
+
+
+
+
+
+
+
+
+
+	vmovdqu8	(%r8),%zmm30{%k2}{z}
+	vmovdqu8	%zmm0,%zmm1{%k1}{z}
+	vpshufb	%zmm8,%zmm1,%zmm0
+	vpxord	%zmm10,%zmm0,%zmm0
+	vpclmulqdq	$0x00,%zmm30,%zmm0,%zmm7
+	vpclmulqdq	$0x01,%zmm30,%zmm0,%zmm1
+	vpclmulqdq	$0x10,%zmm30,%zmm0,%zmm2
+	vpclmulqdq	$0x11,%zmm30,%zmm0,%zmm3
+	vpxord	%zmm7,%zmm4,%zmm4
+	vpternlogd	$0x96,%zmm2,%zmm1,%zmm5
+	vpxord	%zmm3,%zmm6,%zmm6
+
+
+.Lreduce__func3:
+
+	vpclmulqdq	$0x01,%zmm4,%zmm31,%zmm0
+	vpshufd	$0x4e,%zmm4,%zmm4
+	vpternlogd	$0x96,%zmm0,%zmm4,%zmm5
+	vpclmulqdq	$0x01,%zmm5,%zmm31,%zmm0
+	vpshufd	$0x4e,%zmm5,%zmm5
+	vpternlogd	$0x96,%zmm0,%zmm5,%zmm6
+
+	vextracti32x4	$1,%zmm6,%xmm0
+	vextracti32x4	$2,%zmm6,%xmm1
+	vextracti32x4	$3,%zmm6,%xmm2
+	vpxord	%xmm0,%xmm6,%xmm10
+	vpternlogd	$0x96,%xmm1,%xmm2,%xmm10
+
+
+.Ldone__func3:
+
+	vpshufb	%xmm8,%xmm10,%xmm10
+	vmovdqu	%xmm10,(%r12)
+
+	vzeroupper
+	popq	%r12
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r12
+	ret
+
+.cfi_endproc	
+.size	aes_gcm_enc_update_vaes_avx10_512, . - aes_gcm_enc_update_vaes_avx10_512
+.globl	aes_gcm_dec_update_vaes_avx10_512
+.hidden aes_gcm_dec_update_vaes_avx10_512
+.type	aes_gcm_dec_update_vaes_avx10_512,@function
+.align	32
+aes_gcm_dec_update_vaes_avx10_512:
+.cfi_startproc	
+
+_CET_ENDBR
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-16
+
+	movq	16(%rsp),%r12
+
+	vbroadcasti32x4	.Lbswap_mask(%rip),%zmm8
+	vbroadcasti32x4	.Lgfpoly(%rip),%zmm31
+
+
+
+	vmovdqu	(%r12),%xmm10
+	vpshufb	%xmm8,%xmm10,%xmm10
+	vbroadcasti32x4	(%r8),%zmm12
+	vpshufb	%zmm8,%zmm12,%zmm12
+
+
+
+	movl	240(%rcx),%r10d
+	leal	-20(,%r10,4),%r10d
+
+
+
+
+	leaq	96(%rcx,%r10,4),%r11
+	vbroadcasti32x4	(%rcx),%zmm13
+	vbroadcasti32x4	(%r11),%zmm14
+
+
+	vpaddd	.Lctr_pattern(%rip),%zmm12,%zmm12
+
+
+	vbroadcasti32x4	.Linc_4blocks(%rip),%zmm11
+
+
+
+	cmpq	$256-1,%rdx
+	jbe	.Lcrypt_loop_4x_done__func4
+
+
+	vmovdqu8	256-256(%r9),%zmm27
+	vmovdqu8	256-192(%r9),%zmm28
+	vmovdqu8	256-128(%r9),%zmm29
+	vmovdqu8	256-64(%r9),%zmm30
+	vbroadcasti32x4	-144(%r11),%zmm15
+	vbroadcasti32x4	-128(%r11),%zmm16
+	vbroadcasti32x4	-112(%r11),%zmm17
+	vbroadcasti32x4	-96(%r11),%zmm18
+	vbroadcasti32x4	-80(%r11),%zmm19
+	vbroadcasti32x4	-64(%r11),%zmm20
+	vbroadcasti32x4	-48(%r11),%zmm21
+	vbroadcasti32x4	-32(%r11),%zmm22
+	vbroadcasti32x4	-16(%r11),%zmm23
+.Lcrypt_loop_4x__func4:
+	vmovdqu8	0(%rdi),%zmm4
+	vmovdqu8	64(%rdi),%zmm5
+	vmovdqu8	128(%rdi),%zmm6
+	vmovdqu8	192(%rdi),%zmm7
+
+
+
+	vpshufb	%zmm8,%zmm12,%zmm0
+	vpaddd	%zmm11,%zmm12,%zmm12
+	vpshufb	%zmm8,%zmm12,%zmm1
+	vpaddd	%zmm11,%zmm12,%zmm12
+	vpshufb	%zmm8,%zmm12,%zmm2
+	vpaddd	%zmm11,%zmm12,%zmm12
+	vpshufb	%zmm8,%zmm12,%zmm3
+	vpaddd	%zmm11,%zmm12,%zmm12
+
+
+	vpxord	%zmm13,%zmm0,%zmm0
+	vpxord	%zmm13,%zmm1,%zmm1
+	vpxord	%zmm13,%zmm2,%zmm2
+	vpxord	%zmm13,%zmm3,%zmm3
+
+	cmpl	$24,%r10d
+	jl	.Laes128__func4
+	je	.Laes192__func4
+
+	vbroadcasti32x4	-208(%r11),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	vaesenc	%zmm9,%zmm1,%zmm1
+	vaesenc	%zmm9,%zmm2,%zmm2
+	vaesenc	%zmm9,%zmm3,%zmm3
+
+	vbroadcasti32x4	-192(%r11),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	vaesenc	%zmm9,%zmm1,%zmm1
+	vaesenc	%zmm9,%zmm2,%zmm2
+	vaesenc	%zmm9,%zmm3,%zmm3
+
+.Laes192__func4:
+	vbroadcasti32x4	-176(%r11),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	vaesenc	%zmm9,%zmm1,%zmm1
+	vaesenc	%zmm9,%zmm2,%zmm2
+	vaesenc	%zmm9,%zmm3,%zmm3
+
+	vbroadcasti32x4	-160(%r11),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	vaesenc	%zmm9,%zmm1,%zmm1
+	vaesenc	%zmm9,%zmm2,%zmm2
+	vaesenc	%zmm9,%zmm3,%zmm3
+
+.Laes128__func4:
+	vpshufb	%zmm8,%zmm4,%zmm4
+	vpxord	%zmm10,%zmm4,%zmm4
+	vpshufb	%zmm8,%zmm5,%zmm5
+	vpshufb	%zmm8,%zmm6,%zmm6
+
+	vaesenc	%zmm15,%zmm0,%zmm0
+	vaesenc	%zmm15,%zmm1,%zmm1
+	vaesenc	%zmm15,%zmm2,%zmm2
+	vaesenc	%zmm15,%zmm3,%zmm3
+
+	vpshufb	%zmm8,%zmm7,%zmm7
+	vpclmulqdq	$0x00,%zmm27,%zmm4,%zmm10
+	vpclmulqdq	$0x00,%zmm28,%zmm5,%zmm24
+	vpclmulqdq	$0x00,%zmm29,%zmm6,%zmm25
+
+	vaesenc	%zmm16,%zmm0,%zmm0
+	vaesenc	%zmm16,%zmm1,%zmm1
+	vaesenc	%zmm16,%zmm2,%zmm2
+	vaesenc	%zmm16,%zmm3,%zmm3
+
+	vpxord	%zmm24,%zmm10,%zmm10
+	vpclmulqdq	$0x00,%zmm30,%zmm7,%zmm26
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm10
+	vpclmulqdq	$0x01,%zmm27,%zmm4,%zmm24
+
+	vaesenc	%zmm17,%zmm0,%zmm0
+	vaesenc	%zmm17,%zmm1,%zmm1
+	vaesenc	%zmm17,%zmm2,%zmm2
+	vaesenc	%zmm17,%zmm3,%zmm3
+
+	vpclmulqdq	$0x01,%zmm28,%zmm5,%zmm25
+	vpclmulqdq	$0x01,%zmm29,%zmm6,%zmm26
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm24
+	vpclmulqdq	$0x01,%zmm30,%zmm7,%zmm25
+
+	vaesenc	%zmm18,%zmm0,%zmm0
+	vaesenc	%zmm18,%zmm1,%zmm1
+	vaesenc	%zmm18,%zmm2,%zmm2
+	vaesenc	%zmm18,%zmm3,%zmm3
+
+	vpclmulqdq	$0x10,%zmm27,%zmm4,%zmm26
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm24
+	vpclmulqdq	$0x10,%zmm28,%zmm5,%zmm25
+	vpclmulqdq	$0x10,%zmm29,%zmm6,%zmm26
+
+	vaesenc	%zmm19,%zmm0,%zmm0
+	vaesenc	%zmm19,%zmm1,%zmm1
+	vaesenc	%zmm19,%zmm2,%zmm2
+	vaesenc	%zmm19,%zmm3,%zmm3
+
+	vpternlogd	$0x96,%zmm26,%zmm25,%zmm24
+	vpclmulqdq	$0x01,%zmm10,%zmm31,%zmm26
+	vpclmulqdq	$0x10,%zmm30,%zmm7,%zmm25
+	vpxord	%zmm25,%zmm24,%zmm24
+
+	vaesenc	%zmm20,%zmm0,%zmm0
+	vaesenc	%zmm20,%zmm1,%zmm1
+	vaesenc	%zmm20,%zmm2,%zmm2
+	vaesenc	%zmm20,%zmm3,%zmm3
+
+	vpshufd	$0x4e,%zmm10,%zmm10
+	vpclmulqdq	$0x11,%zmm27,%zmm4,%zmm4
+	vpclmulqdq	$0x11,%zmm28,%zmm5,%zmm5
+	vpclmulqdq	$0x11,%zmm29,%zmm6,%zmm6
+
+	vaesenc	%zmm21,%zmm0,%zmm0
+	vaesenc	%zmm21,%zmm1,%zmm1
+	vaesenc	%zmm21,%zmm2,%zmm2
+	vaesenc	%zmm21,%zmm3,%zmm3
+
+	vpternlogd	$0x96,%zmm26,%zmm10,%zmm24
+	vpclmulqdq	$0x11,%zmm30,%zmm7,%zmm7
+	vpternlogd	$0x96,%zmm6,%zmm5,%zmm4
+	vpclmulqdq	$0x01,%zmm24,%zmm31,%zmm25
+
+	vaesenc	%zmm22,%zmm0,%zmm0
+	vaesenc	%zmm22,%zmm1,%zmm1
+	vaesenc	%zmm22,%zmm2,%zmm2
+	vaesenc	%zmm22,%zmm3,%zmm3
+
+	vpxord	%zmm7,%zmm4,%zmm10
+	vpshufd	$0x4e,%zmm24,%zmm24
+	vpternlogd	$0x96,%zmm25,%zmm24,%zmm10
+
+	vaesenc	%zmm23,%zmm0,%zmm0
+	vaesenc	%zmm23,%zmm1,%zmm1
+	vaesenc	%zmm23,%zmm2,%zmm2
+	vaesenc	%zmm23,%zmm3,%zmm3
+
+	vextracti32x4	$1,%zmm10,%xmm4
+	vextracti32x4	$2,%zmm10,%xmm5
+	vextracti32x4	$3,%zmm10,%xmm6
+	vpxord	%xmm4,%xmm10,%xmm10
+	vpternlogd	$0x96,%xmm5,%xmm6,%xmm10
+
+
+
+
+	vpxord	0(%rdi),%zmm14,%zmm4
+	vpxord	64(%rdi),%zmm14,%zmm5
+	vpxord	128(%rdi),%zmm14,%zmm6
+	vpxord	192(%rdi),%zmm14,%zmm7
+
+
+
+	vaesenclast	%zmm4,%zmm0,%zmm4
+	vaesenclast	%zmm5,%zmm1,%zmm5
+	vaesenclast	%zmm6,%zmm2,%zmm6
+	vaesenclast	%zmm7,%zmm3,%zmm7
+
+
+	vmovdqu8	%zmm4,0(%rsi)
+	vmovdqu8	%zmm5,64(%rsi)
+	vmovdqu8	%zmm6,128(%rsi)
+	vmovdqu8	%zmm7,192(%rsi)
+
+	subq	$-256,%rdi
+	subq	$-256,%rsi
+	addq	$-256,%rdx
+	cmpq	$256-1,%rdx
+	ja	.Lcrypt_loop_4x__func4
+.Lcrypt_loop_4x_done__func4:
+
+	testq	%rdx,%rdx
+	jz	.Ldone__func4
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	movq	%rdx,%rax
+	negq	%rax
+	andq	$-16,%rax
+	leaq	256(%r9,%rax,1),%r8
+	vpxor	%xmm4,%xmm4,%xmm4
+	vpxor	%xmm5,%xmm5,%xmm5
+	vpxor	%xmm6,%xmm6,%xmm6
+
+	cmpq	$64,%rdx
+	jb	.Lpartial_vec__func4
+
+.Lcrypt_loop_1x__func4:
+
+
+
+	vpshufb	%zmm8,%zmm12,%zmm0
+	vpaddd	%zmm11,%zmm12,%zmm12
+	vpxord	%zmm13,%zmm0,%zmm0
+	leaq	16(%rcx),%rax
+.Lvaesenc_loop_tail_full_vec__func4:
+	vbroadcasti32x4	(%rax),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	.Lvaesenc_loop_tail_full_vec__func4
+	vaesenclast	%zmm14,%zmm0,%zmm0
+
+
+	vmovdqu8	(%rdi),%zmm1
+	vpxord	%zmm1,%zmm0,%zmm0
+	vmovdqu8	%zmm0,(%rsi)
+
+
+	vmovdqu8	(%r8),%zmm30
+	vpshufb	%zmm8,%zmm1,%zmm0
+	vpxord	%zmm10,%zmm0,%zmm0
+	vpclmulqdq	$0x00,%zmm30,%zmm0,%zmm7
+	vpclmulqdq	$0x01,%zmm30,%zmm0,%zmm1
+	vpclmulqdq	$0x10,%zmm30,%zmm0,%zmm2
+	vpclmulqdq	$0x11,%zmm30,%zmm0,%zmm3
+	vpxord	%zmm7,%zmm4,%zmm4
+	vpternlogd	$0x96,%zmm2,%zmm1,%zmm5
+	vpxord	%zmm3,%zmm6,%zmm6
+
+	vpxor	%xmm10,%xmm10,%xmm10
+
+	addq	$64,%r8
+	addq	$64,%rdi
+	addq	$64,%rsi
+	subq	$64,%rdx
+	cmpq	$64,%rdx
+	jae	.Lcrypt_loop_1x__func4
+
+	testq	%rdx,%rdx
+	jz	.Lreduce__func4
+
+.Lpartial_vec__func4:
+
+
+
+
+	movq	$-1,%rax
+	bzhiq	%rdx,%rax,%rax
+	kmovq	%rax,%k1
+	addq	$15,%rdx
+	andq	$-16,%rdx
+	movq	$-1,%rax
+	bzhiq	%rdx,%rax,%rax
+	kmovq	%rax,%k2
+
+
+
+	vpshufb	%zmm8,%zmm12,%zmm0
+	vpxord	%zmm13,%zmm0,%zmm0
+	leaq	16(%rcx),%rax
+.Lvaesenc_loop_tail_partialvec__func4:
+	vbroadcasti32x4	(%rax),%zmm9
+	vaesenc	%zmm9,%zmm0,%zmm0
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	.Lvaesenc_loop_tail_partialvec__func4
+	vaesenclast	%zmm14,%zmm0,%zmm0
+
+
+	vmovdqu8	(%rdi),%zmm1{%k1}{z}
+	vpxord	%zmm1,%zmm0,%zmm0
+	vmovdqu8	%zmm0,(%rsi){%k1}
+
+
+
+
+
+
+
+
+
+
+
+
+
+	vmovdqu8	(%r8),%zmm30{%k2}{z}
+
+	vpshufb	%zmm8,%zmm1,%zmm0
+	vpxord	%zmm10,%zmm0,%zmm0
+	vpclmulqdq	$0x00,%zmm30,%zmm0,%zmm7
+	vpclmulqdq	$0x01,%zmm30,%zmm0,%zmm1
+	vpclmulqdq	$0x10,%zmm30,%zmm0,%zmm2
+	vpclmulqdq	$0x11,%zmm30,%zmm0,%zmm3
+	vpxord	%zmm7,%zmm4,%zmm4
+	vpternlogd	$0x96,%zmm2,%zmm1,%zmm5
+	vpxord	%zmm3,%zmm6,%zmm6
+
+
+.Lreduce__func4:
+
+	vpclmulqdq	$0x01,%zmm4,%zmm31,%zmm0
+	vpshufd	$0x4e,%zmm4,%zmm4
+	vpternlogd	$0x96,%zmm0,%zmm4,%zmm5
+	vpclmulqdq	$0x01,%zmm5,%zmm31,%zmm0
+	vpshufd	$0x4e,%zmm5,%zmm5
+	vpternlogd	$0x96,%zmm0,%zmm5,%zmm6
+
+	vextracti32x4	$1,%zmm6,%xmm0
+	vextracti32x4	$2,%zmm6,%xmm1
+	vextracti32x4	$3,%zmm6,%xmm2
+	vpxord	%xmm0,%xmm6,%xmm10
+	vpternlogd	$0x96,%xmm1,%xmm2,%xmm10
+
+
+.Ldone__func4:
+
+	vpshufb	%xmm8,%xmm10,%xmm10
+	vmovdqu	%xmm10,(%r12)
+
+	vzeroupper
+	popq	%r12
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r12
+	ret
+
+.cfi_endproc	
+.size	aes_gcm_dec_update_vaes_avx10_512, . - aes_gcm_dec_update_vaes_avx10_512
+#endif
diff --git a/gen/bcm/aes-gcm-avx10-x86_64-win.asm b/gen/bcm/aes-gcm-avx10-x86_64-win.asm
new file mode 100644
index 0000000..258f923
--- /dev/null
+++ b/gen/bcm/aes-gcm-avx10-x86_64-win.asm
@@ -0,0 +1,2790 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section	.rdata rdata align=8
+ALIGN	64
+
+
+$L$bswap_mask:
+	DQ	0x08090a0b0c0d0e0f,0x0001020304050607
+
+
+
+
+
+
+
+
+$L$gfpoly:
+	DQ	1,0xc200000000000000
+
+
+$L$gfpoly_and_internal_carrybit:
+	DQ	1,0xc200000000000001
+
+
+
+
+
+$L$ctr_pattern:
+	DQ	0,0
+	DQ	1,0
+$L$inc_2blocks:
+	DQ	2,0
+	DQ	3,0
+$L$inc_4blocks:
+	DQ	4,0
+
+section	.text code align=64
+
+global	gcm_gmult_vpclmulqdq_avx10
+
+ALIGN	32
+gcm_gmult_vpclmulqdq_avx10:
+
+$L$SEH_begin_gcm_gmult_vpclmulqdq_avx10_1:
+_CET_ENDBR
+	sub	rsp,24
+$L$SEH_prologue_gcm_gmult_vpclmulqdq_avx10_2:
+	movdqa	XMMWORD[rsp],xmm6
+$L$SEH_prologue_gcm_gmult_vpclmulqdq_avx10_3:
+
+$L$SEH_endprologue_gcm_gmult_vpclmulqdq_avx10_4:
+
+	vmovdqu	xmm0,XMMWORD[rcx]
+	vmovdqu	xmm1,XMMWORD[$L$bswap_mask]
+	vmovdqu	xmm2,XMMWORD[((256-16))+rdx]
+	vmovdqu	xmm3,XMMWORD[$L$gfpoly]
+	vpshufb	xmm0,xmm0,xmm1
+
+	vpclmulqdq	xmm4,xmm0,xmm2,0x00
+	vpclmulqdq	xmm5,xmm0,xmm2,0x01
+	vpclmulqdq	xmm6,xmm0,xmm2,0x10
+	vpxord	xmm5,xmm5,xmm6
+	vpclmulqdq	xmm6,xmm3,xmm4,0x01
+	vpshufd	xmm4,xmm4,0x4e
+	vpternlogd	xmm5,xmm4,xmm6,0x96
+	vpclmulqdq	xmm0,xmm0,xmm2,0x11
+	vpclmulqdq	xmm4,xmm3,xmm5,0x01
+	vpshufd	xmm5,xmm5,0x4e
+	vpternlogd	xmm0,xmm5,xmm4,0x96
+
+
+	vpshufb	xmm0,xmm0,xmm1
+	vmovdqu	XMMWORD[rcx],xmm0
+	movdqa	xmm6,XMMWORD[rsp]
+	add	rsp,24
+	ret
+$L$SEH_end_gcm_gmult_vpclmulqdq_avx10_5:
+
+
+global	gcm_init_vpclmulqdq_avx10
+
+ALIGN	32
+gcm_init_vpclmulqdq_avx10:
+
+
+_CET_ENDBR
+
+	lea	r8,[((256-32))+rcx]
+
+
+
+	vpshufd	xmm3,XMMWORD[rdx],0x4e
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	vpshufd	xmm0,xmm3,0xd3
+	vpsrad	xmm0,xmm0,31
+	vpaddq	xmm3,xmm3,xmm3
+
+	vpternlogd	xmm3,xmm0,XMMWORD[$L$gfpoly_and_internal_carrybit],0x78
+
+
+	vbroadcasti32x4	ymm5,YMMWORD[$L$gfpoly]
+
+
+
+
+
+
+
+
+	vpclmulqdq	xmm0,xmm3,xmm3,0x00
+	vpclmulqdq	xmm1,xmm3,xmm3,0x01
+	vpclmulqdq	xmm2,xmm3,xmm3,0x10
+	vpxord	xmm1,xmm1,xmm2
+	vpclmulqdq	xmm2,xmm5,xmm0,0x01
+	vpshufd	xmm0,xmm0,0x4e
+	vpternlogd	xmm1,xmm0,xmm2,0x96
+	vpclmulqdq	xmm4,xmm3,xmm3,0x11
+	vpclmulqdq	xmm0,xmm5,xmm1,0x01
+	vpshufd	xmm1,xmm1,0x4e
+	vpternlogd	xmm4,xmm1,xmm0,0x96
+
+
+
+	vinserti128	ymm3,ymm4,xmm3,1
+	vinserti128	ymm4,ymm4,xmm4,1
+
+	vmovdqu8	YMMWORD[r8],ymm3
+
+
+
+
+
+	mov	eax,7
+$L$precompute_next__func1:
+	sub	r8,32
+	vpclmulqdq	ymm0,ymm3,ymm4,0x00
+	vpclmulqdq	ymm1,ymm3,ymm4,0x01
+	vpclmulqdq	ymm2,ymm3,ymm4,0x10
+	vpxord	ymm1,ymm1,ymm2
+	vpclmulqdq	ymm2,ymm5,ymm0,0x01
+	vpshufd	ymm0,ymm0,0x4e
+	vpternlogd	ymm1,ymm0,ymm2,0x96
+	vpclmulqdq	ymm3,ymm3,ymm4,0x11
+	vpclmulqdq	ymm0,ymm5,ymm1,0x01
+	vpshufd	ymm1,ymm1,0x4e
+	vpternlogd	ymm3,ymm1,ymm0,0x96
+
+	vmovdqu8	YMMWORD[r8],ymm3
+	dec	eax
+	jnz	NEAR $L$precompute_next__func1
+
+	vzeroupper
+	ret
+
+
+
+global	gcm_ghash_vpclmulqdq_avx10_256
+
+ALIGN	32
+gcm_ghash_vpclmulqdq_avx10_256:
+
+$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1:
+_CET_ENDBR
+	sub	rsp,136
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_2:
+	movdqa	XMMWORD[rsp],xmm6
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_3:
+	movdqa	XMMWORD[16+rsp],xmm7
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_4:
+	movdqa	XMMWORD[32+rsp],xmm8
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_5:
+	movdqa	XMMWORD[48+rsp],xmm9
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_6:
+	movdqa	XMMWORD[64+rsp],xmm10
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_7:
+	movdqa	XMMWORD[80+rsp],xmm11
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_8:
+	movdqa	XMMWORD[96+rsp],xmm12
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_9:
+	movdqa	XMMWORD[112+rsp],xmm13
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_10:
+
+$L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx10_256_11:
+
+
+
+
+	vmovdqu	xmm4,XMMWORD[$L$bswap_mask]
+	vmovdqu	xmm10,XMMWORD[$L$gfpoly]
+
+
+	vmovdqu	xmm5,XMMWORD[rcx]
+	vpshufb	xmm5,xmm5,xmm4
+
+
+	cmp	r9,32
+	jb	NEAR $L$aad_blockbyblock__func1
+
+
+
+	vshufi64x2	ymm4,ymm4,ymm4,0
+	vshufi64x2	ymm10,ymm10,ymm10,0
+
+
+	vmovdqu8	ymm9,YMMWORD[((256-32))+rdx]
+
+	cmp	r9,4*32-1
+	jbe	NEAR $L$aad_loop_1x__func1
+
+
+	vmovdqu8	ymm6,YMMWORD[((256-128))+rdx]
+	vmovdqu8	ymm7,YMMWORD[((256-96))+rdx]
+	vmovdqu8	ymm8,YMMWORD[((256-64))+rdx]
+
+
+$L$aad_loop_4x__func1:
+	vmovdqu8	ymm0,YMMWORD[r8]
+	vmovdqu8	ymm1,YMMWORD[32+r8]
+	vmovdqu8	ymm2,YMMWORD[64+r8]
+	vmovdqu8	ymm3,YMMWORD[96+r8]
+	vpshufb	ymm0,ymm0,ymm4
+	vpxord	ymm0,ymm0,ymm5
+	vpshufb	ymm1,ymm1,ymm4
+	vpshufb	ymm2,ymm2,ymm4
+	vpshufb	ymm3,ymm3,ymm4
+	vpclmulqdq	ymm5,ymm0,ymm6,0x00
+	vpclmulqdq	ymm11,ymm1,ymm7,0x00
+	vpclmulqdq	ymm12,ymm2,ymm8,0x00
+	vpxord	ymm5,ymm5,ymm11
+	vpclmulqdq	ymm13,ymm3,ymm9,0x00
+	vpternlogd	ymm5,ymm12,ymm13,0x96
+	vpclmulqdq	ymm11,ymm0,ymm6,0x01
+	vpclmulqdq	ymm12,ymm1,ymm7,0x01
+	vpclmulqdq	ymm13,ymm2,ymm8,0x01
+	vpternlogd	ymm11,ymm12,ymm13,0x96
+	vpclmulqdq	ymm12,ymm3,ymm9,0x01
+	vpclmulqdq	ymm13,ymm0,ymm6,0x10
+	vpternlogd	ymm11,ymm12,ymm13,0x96
+	vpclmulqdq	ymm12,ymm1,ymm7,0x10
+	vpclmulqdq	ymm13,ymm2,ymm8,0x10
+	vpternlogd	ymm11,ymm12,ymm13,0x96
+	vpclmulqdq	ymm13,ymm10,ymm5,0x01
+	vpclmulqdq	ymm12,ymm3,ymm9,0x10
+	vpxord	ymm11,ymm11,ymm12
+	vpshufd	ymm5,ymm5,0x4e
+	vpclmulqdq	ymm0,ymm0,ymm6,0x11
+	vpclmulqdq	ymm1,ymm1,ymm7,0x11
+	vpclmulqdq	ymm2,ymm2,ymm8,0x11
+	vpternlogd	ymm11,ymm5,ymm13,0x96
+	vpclmulqdq	ymm3,ymm3,ymm9,0x11
+	vpternlogd	ymm0,ymm1,ymm2,0x96
+	vpclmulqdq	ymm12,ymm10,ymm11,0x01
+	vpxord	ymm5,ymm0,ymm3
+	vpshufd	ymm11,ymm11,0x4e
+	vpternlogd	ymm5,ymm11,ymm12,0x96
+	vextracti32x4	xmm0,ymm5,1
+	vpxord	xmm5,xmm5,xmm0
+
+	sub	r8,-4*32
+	add	r9,-4*32
+	cmp	r9,4*32-1
+	ja	NEAR $L$aad_loop_4x__func1
+
+
+	cmp	r9,32
+	jb	NEAR $L$aad_large_done__func1
+$L$aad_loop_1x__func1:
+	vmovdqu8	ymm0,YMMWORD[r8]
+	vpshufb	ymm0,ymm0,ymm4
+	vpxord	ymm5,ymm5,ymm0
+	vpclmulqdq	ymm0,ymm5,ymm9,0x00
+	vpclmulqdq	ymm1,ymm5,ymm9,0x01
+	vpclmulqdq	ymm2,ymm5,ymm9,0x10
+	vpxord	ymm1,ymm1,ymm2
+	vpclmulqdq	ymm2,ymm10,ymm0,0x01
+	vpshufd	ymm0,ymm0,0x4e
+	vpternlogd	ymm1,ymm0,ymm2,0x96
+	vpclmulqdq	ymm5,ymm5,ymm9,0x11
+	vpclmulqdq	ymm0,ymm10,ymm1,0x01
+	vpshufd	ymm1,ymm1,0x4e
+	vpternlogd	ymm5,ymm1,ymm0,0x96
+
+	vextracti32x4	xmm0,ymm5,1
+	vpxord	xmm5,xmm5,xmm0
+
+	add	r8,32
+	sub	r9,32
+	cmp	r9,32
+	jae	NEAR $L$aad_loop_1x__func1
+
+$L$aad_large_done__func1:
+
+
+	vzeroupper
+
+
+$L$aad_blockbyblock__func1:
+	test	r9,r9
+	jz	NEAR $L$aad_done__func1
+	vmovdqu	xmm9,XMMWORD[((256-16))+rdx]
+$L$aad_loop_blockbyblock__func1:
+	vmovdqu	xmm0,XMMWORD[r8]
+	vpshufb	xmm0,xmm0,xmm4
+	vpxor	xmm5,xmm5,xmm0
+	vpclmulqdq	xmm0,xmm5,xmm9,0x00
+	vpclmulqdq	xmm1,xmm5,xmm9,0x01
+	vpclmulqdq	xmm2,xmm5,xmm9,0x10
+	vpxord	xmm1,xmm1,xmm2
+	vpclmulqdq	xmm2,xmm10,xmm0,0x01
+	vpshufd	xmm0,xmm0,0x4e
+	vpternlogd	xmm1,xmm0,xmm2,0x96
+	vpclmulqdq	xmm5,xmm5,xmm9,0x11
+	vpclmulqdq	xmm0,xmm10,xmm1,0x01
+	vpshufd	xmm1,xmm1,0x4e
+	vpternlogd	xmm5,xmm1,xmm0,0x96
+
+	add	r8,16
+	sub	r9,16
+	jnz	NEAR $L$aad_loop_blockbyblock__func1
+
+$L$aad_done__func1:
+
+	vpshufb	xmm5,xmm5,xmm4
+	vmovdqu	XMMWORD[rcx],xmm5
+	movdqa	xmm6,XMMWORD[rsp]
+	movdqa	xmm7,XMMWORD[16+rsp]
+	movdqa	xmm8,XMMWORD[32+rsp]
+	movdqa	xmm9,XMMWORD[48+rsp]
+	movdqa	xmm10,XMMWORD[64+rsp]
+	movdqa	xmm11,XMMWORD[80+rsp]
+	movdqa	xmm12,XMMWORD[96+rsp]
+	movdqa	xmm13,XMMWORD[112+rsp]
+	add	rsp,136
+	ret
+$L$SEH_end_gcm_ghash_vpclmulqdq_avx10_256_12:
+
+
+global	aes_gcm_enc_update_vaes_avx10_256
+
+ALIGN	32
+aes_gcm_enc_update_vaes_avx10_256:
+
+$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1:
+_CET_ENDBR
+	push	rsi
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_2:
+	push	rdi
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_3:
+	push	r12
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_4:
+
+	mov	rsi,QWORD[64+rsp]
+	mov	rdi,QWORD[72+rsp]
+	mov	r12,QWORD[80+rsp]
+	sub	rsp,160
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_5:
+	movdqa	XMMWORD[rsp],xmm6
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_6:
+	movdqa	XMMWORD[16+rsp],xmm7
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_7:
+	movdqa	XMMWORD[32+rsp],xmm8
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_8:
+	movdqa	XMMWORD[48+rsp],xmm9
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_9:
+	movdqa	XMMWORD[64+rsp],xmm10
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_10:
+	movdqa	XMMWORD[80+rsp],xmm11
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_11:
+	movdqa	XMMWORD[96+rsp],xmm12
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_12:
+	movdqa	XMMWORD[112+rsp],xmm13
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_13:
+	movdqa	XMMWORD[128+rsp],xmm14
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_14:
+	movdqa	XMMWORD[144+rsp],xmm15
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_15:
+
+$L$SEH_endprologue_aes_gcm_enc_update_vaes_avx10_256_16:
+%ifdef BORINGSSL_DISPATCH_TEST
+EXTERN	BORINGSSL_function_hit
+	mov	BYTE[((BORINGSSL_function_hit+6))],1
+%endif
+
+	vbroadcasti32x4	ymm8,YMMWORD[$L$bswap_mask]
+	vbroadcasti32x4	ymm31,YMMWORD[$L$gfpoly]
+
+
+
+	vmovdqu	xmm10,XMMWORD[r12]
+	vpshufb	xmm10,xmm10,xmm8
+	vbroadcasti32x4	ymm12,YMMWORD[rsi]
+	vpshufb	ymm12,ymm12,ymm8
+
+
+
+	mov	r10d,DWORD[240+r9]
+	lea	r10d,[((-20))+r10*4]
+
+
+
+
+	lea	r11,[96+r10*4+r9]
+	vbroadcasti32x4	ymm13,YMMWORD[r9]
+	vbroadcasti32x4	ymm14,YMMWORD[r11]
+
+
+	vpaddd	ymm12,ymm12,YMMWORD[$L$ctr_pattern]
+
+
+	vbroadcasti32x4	ymm11,YMMWORD[$L$inc_2blocks]
+
+
+
+	cmp	r8,4*32-1
+	jbe	NEAR $L$crypt_loop_4x_done__func1
+
+
+	vmovdqu8	ymm27,YMMWORD[((256-128))+rdi]
+	vmovdqu8	ymm28,YMMWORD[((256-96))+rdi]
+	vmovdqu8	ymm29,YMMWORD[((256-64))+rdi]
+	vmovdqu8	ymm30,YMMWORD[((256-32))+rdi]
+
+
+
+
+	vpshufb	ymm0,ymm12,ymm8
+	vpaddd	ymm12,ymm12,ymm11
+	vpshufb	ymm1,ymm12,ymm8
+	vpaddd	ymm12,ymm12,ymm11
+	vpshufb	ymm2,ymm12,ymm8
+	vpaddd	ymm12,ymm12,ymm11
+	vpshufb	ymm3,ymm12,ymm8
+	vpaddd	ymm12,ymm12,ymm11
+
+
+	vpxord	ymm0,ymm0,ymm13
+	vpxord	ymm1,ymm1,ymm13
+	vpxord	ymm2,ymm2,ymm13
+	vpxord	ymm3,ymm3,ymm13
+
+	lea	rax,[16+r9]
+$L$vaesenc_loop_first_4_vecs__func1:
+	vbroadcasti32x4	ymm9,YMMWORD[rax]
+	vaesenc	ymm0,ymm0,ymm9
+	vaesenc	ymm1,ymm1,ymm9
+	vaesenc	ymm2,ymm2,ymm9
+	vaesenc	ymm3,ymm3,ymm9
+
+	add	rax,16
+	cmp	r11,rax
+	jne	NEAR $L$vaesenc_loop_first_4_vecs__func1
+
+
+
+	vpxord	ymm4,ymm14,YMMWORD[rcx]
+	vpxord	ymm5,ymm14,YMMWORD[32+rcx]
+	vpxord	ymm6,ymm14,YMMWORD[64+rcx]
+	vpxord	ymm7,ymm14,YMMWORD[96+rcx]
+
+
+
+	vaesenclast	ymm4,ymm0,ymm4
+	vaesenclast	ymm5,ymm1,ymm5
+	vaesenclast	ymm6,ymm2,ymm6
+	vaesenclast	ymm7,ymm3,ymm7
+
+
+	vmovdqu8	YMMWORD[rdx],ymm4
+	vmovdqu8	YMMWORD[32+rdx],ymm5
+	vmovdqu8	YMMWORD[64+rdx],ymm6
+	vmovdqu8	YMMWORD[96+rdx],ymm7
+
+	sub	rcx,-4*32
+	sub	rdx,-4*32
+	add	r8,-4*32
+	cmp	r8,4*32-1
+	jbe	NEAR $L$ghash_last_ciphertext_4x__func1
+	vbroadcasti32x4	ymm15,YMMWORD[((-144))+r11]
+	vbroadcasti32x4	ymm16,YMMWORD[((-128))+r11]
+	vbroadcasti32x4	ymm17,YMMWORD[((-112))+r11]
+	vbroadcasti32x4	ymm18,YMMWORD[((-96))+r11]
+	vbroadcasti32x4	ymm19,YMMWORD[((-80))+r11]
+	vbroadcasti32x4	ymm20,YMMWORD[((-64))+r11]
+	vbroadcasti32x4	ymm21,YMMWORD[((-48))+r11]
+	vbroadcasti32x4	ymm22,YMMWORD[((-32))+r11]
+	vbroadcasti32x4	ymm23,YMMWORD[((-16))+r11]
+$L$crypt_loop_4x__func1:
+
+
+
+	vpshufb	ymm0,ymm12,ymm8
+	vpaddd	ymm12,ymm12,ymm11
+	vpshufb	ymm1,ymm12,ymm8
+	vpaddd	ymm12,ymm12,ymm11
+	vpshufb	ymm2,ymm12,ymm8
+	vpaddd	ymm12,ymm12,ymm11
+	vpshufb	ymm3,ymm12,ymm8
+	vpaddd	ymm12,ymm12,ymm11
+
+
+	vpxord	ymm0,ymm0,ymm13
+	vpxord	ymm1,ymm1,ymm13
+	vpxord	ymm2,ymm2,ymm13
+	vpxord	ymm3,ymm3,ymm13
+
+	cmp	r10d,24
+	jl	NEAR $L$aes128__func1
+	je	NEAR $L$aes192__func1
+
+	vbroadcasti32x4	ymm9,YMMWORD[((-208))+r11]
+	vaesenc	ymm0,ymm0,ymm9
+	vaesenc	ymm1,ymm1,ymm9
+	vaesenc	ymm2,ymm2,ymm9
+	vaesenc	ymm3,ymm3,ymm9
+
+	vbroadcasti32x4	ymm9,YMMWORD[((-192))+r11]
+	vaesenc	ymm0,ymm0,ymm9
+	vaesenc	ymm1,ymm1,ymm9
+	vaesenc	ymm2,ymm2,ymm9
+	vaesenc	ymm3,ymm3,ymm9
+
+$L$aes192__func1:
+	vbroadcasti32x4	ymm9,YMMWORD[((-176))+r11]
+	vaesenc	ymm0,ymm0,ymm9
+	vaesenc	ymm1,ymm1,ymm9
+	vaesenc	ymm2,ymm2,ymm9
+	vaesenc	ymm3,ymm3,ymm9
+
+	vbroadcasti32x4	ymm9,YMMWORD[((-160))+r11]
+	vaesenc	ymm0,ymm0,ymm9
+	vaesenc	ymm1,ymm1,ymm9
+	vaesenc	ymm2,ymm2,ymm9
+	vaesenc	ymm3,ymm3,ymm9
+
+$L$aes128__func1:
+	vpshufb	ymm4,ymm4,ymm8
+	vpxord	ymm4,ymm4,ymm10
+	vpshufb	ymm5,ymm5,ymm8
+	vpshufb	ymm6,ymm6,ymm8
+
+	vaesenc	ymm0,ymm0,ymm15
+	vaesenc	ymm1,ymm1,ymm15
+	vaesenc	ymm2,ymm2,ymm15
+	vaesenc	ymm3,ymm3,ymm15
+
+	vpshufb	ymm7,ymm7,ymm8
+	vpclmulqdq	ymm10,ymm4,ymm27,0x00
+	vpclmulqdq	ymm24,ymm5,ymm28,0x00
+	vpclmulqdq	ymm25,ymm6,ymm29,0x00
+
+	vaesenc	ymm0,ymm0,ymm16
+	vaesenc	ymm1,ymm1,ymm16
+	vaesenc	ymm2,ymm2,ymm16
+	vaesenc	ymm3,ymm3,ymm16
+
+	vpxord	ymm10,ymm10,ymm24
+	vpclmulqdq	ymm26,ymm7,ymm30,0x00
+	vpternlogd	ymm10,ymm25,ymm26,0x96
+	vpclmulqdq	ymm24,ymm4,ymm27,0x01
+
+	vaesenc	ymm0,ymm0,ymm17
+	vaesenc	ymm1,ymm1,ymm17
+	vaesenc	ymm2,ymm2,ymm17
+	vaesenc	ymm3,ymm3,ymm17
+
+	vpclmulqdq	ymm25,ymm5,ymm28,0x01
+	vpclmulqdq	ymm26,ymm6,ymm29,0x01
+	vpternlogd	ymm24,ymm25,ymm26,0x96
+	vpclmulqdq	ymm25,ymm7,ymm30,0x01
+
+	vaesenc	ymm0,ymm0,ymm18
+	vaesenc	ymm1,ymm1,ymm18
+	vaesenc	ymm2,ymm2,ymm18
+	vaesenc	ymm3,ymm3,ymm18
+
+	vpclmulqdq	ymm26,ymm4,ymm27,0x10
+	vpternlogd	ymm24,ymm25,ymm26,0x96
+	vpclmulqdq	ymm25,ymm5,ymm28,0x10
+	vpclmulqdq	ymm26,ymm6,ymm29,0x10
+
+	vaesenc	ymm0,ymm0,ymm19
+	vaesenc	ymm1,ymm1,ymm19
+	vaesenc	ymm2,ymm2,ymm19
+	vaesenc	ymm3,ymm3,ymm19
+
+	vpternlogd	ymm24,ymm25,ymm26,0x96
+	vpclmulqdq	ymm26,ymm31,ymm10,0x01
+	vpclmulqdq	ymm25,ymm7,ymm30,0x10
+	vpxord	ymm24,ymm24,ymm25
+
+	vaesenc	ymm0,ymm0,ymm20
+	vaesenc	ymm1,ymm1,ymm20
+	vaesenc	ymm2,ymm2,ymm20
+	vaesenc	ymm3,ymm3,ymm20
+
+	vpshufd	ymm10,ymm10,0x4e
+	vpclmulqdq	ymm4,ymm4,ymm27,0x11
+	vpclmulqdq	ymm5,ymm5,ymm28,0x11
+	vpclmulqdq	ymm6,ymm6,ymm29,0x11
+
+	vaesenc	ymm0,ymm0,ymm21
+	vaesenc	ymm1,ymm1,ymm21
+	vaesenc	ymm2,ymm2,ymm21
+	vaesenc	ymm3,ymm3,ymm21
+
+	vpternlogd	ymm24,ymm10,ymm26,0x96
+	vpclmulqdq	ymm7,ymm7,ymm30,0x11
+	vpternlogd	ymm4,ymm5,ymm6,0x96
+	vpclmulqdq	ymm25,ymm31,ymm24,0x01
+
+	vaesenc	ymm0,ymm0,ymm22
+	vaesenc	ymm1,ymm1,ymm22
+	vaesenc	ymm2,ymm2,ymm22
+	vaesenc	ymm3,ymm3,ymm22
+
+	vpxord	ymm10,ymm4,ymm7
+	vpshufd	ymm24,ymm24,0x4e
+	vpternlogd	ymm10,ymm24,ymm25,0x96
+
+	vaesenc	ymm0,ymm0,ymm23
+	vaesenc	ymm1,ymm1,ymm23
+	vaesenc	ymm2,ymm2,ymm23
+	vaesenc	ymm3,ymm3,ymm23
+
+	vextracti32x4	xmm4,ymm10,1
+	vpxord	xmm10,xmm10,xmm4
+
+
+
+
+	vpxord	ymm4,ymm14,YMMWORD[rcx]
+	vpxord	ymm5,ymm14,YMMWORD[32+rcx]
+	vpxord	ymm6,ymm14,YMMWORD[64+rcx]
+	vpxord	ymm7,ymm14,YMMWORD[96+rcx]
+
+
+
+	vaesenclast	ymm4,ymm0,ymm4
+	vaesenclast	ymm5,ymm1,ymm5
+	vaesenclast	ymm6,ymm2,ymm6
+	vaesenclast	ymm7,ymm3,ymm7
+
+
+	vmovdqu8	YMMWORD[rdx],ymm4
+	vmovdqu8	YMMWORD[32+rdx],ymm5
+	vmovdqu8	YMMWORD[64+rdx],ymm6
+	vmovdqu8	YMMWORD[96+rdx],ymm7
+
+	sub	rcx,-4*32
+	sub	rdx,-4*32
+	add	r8,-4*32
+	cmp	r8,4*32-1
+	ja	NEAR $L$crypt_loop_4x__func1
+$L$ghash_last_ciphertext_4x__func1:
+	vpshufb	ymm4,ymm4,ymm8
+	vpxord	ymm4,ymm4,ymm10
+	vpshufb	ymm5,ymm5,ymm8
+	vpshufb	ymm6,ymm6,ymm8
+	vpshufb	ymm7,ymm7,ymm8
+	vpclmulqdq	ymm10,ymm4,ymm27,0x00
+	vpclmulqdq	ymm24,ymm5,ymm28,0x00
+	vpclmulqdq	ymm25,ymm6,ymm29,0x00
+	vpxord	ymm10,ymm10,ymm24
+	vpclmulqdq	ymm26,ymm7,ymm30,0x00
+	vpternlogd	ymm10,ymm25,ymm26,0x96
+	vpclmulqdq	ymm24,ymm4,ymm27,0x01
+	vpclmulqdq	ymm25,ymm5,ymm28,0x01
+	vpclmulqdq	ymm26,ymm6,ymm29,0x01
+	vpternlogd	ymm24,ymm25,ymm26,0x96
+	vpclmulqdq	ymm25,ymm7,ymm30,0x01
+	vpclmulqdq	ymm26,ymm4,ymm27,0x10
+	vpternlogd	ymm24,ymm25,ymm26,0x96
+	vpclmulqdq	ymm25,ymm5,ymm28,0x10
+	vpclmulqdq	ymm26,ymm6,ymm29,0x10
+	vpternlogd	ymm24,ymm25,ymm26,0x96
+	vpclmulqdq	ymm26,ymm31,ymm10,0x01
+	vpclmulqdq	ymm25,ymm7,ymm30,0x10
+	vpxord	ymm24,ymm24,ymm25
+	vpshufd	ymm10,ymm10,0x4e
+	vpclmulqdq	ymm4,ymm4,ymm27,0x11
+	vpclmulqdq	ymm5,ymm5,ymm28,0x11
+	vpclmulqdq	ymm6,ymm6,ymm29,0x11
+	vpternlogd	ymm24,ymm10,ymm26,0x96
+	vpclmulqdq	ymm7,ymm7,ymm30,0x11
+	vpternlogd	ymm4,ymm5,ymm6,0x96
+	vpclmulqdq	ymm25,ymm31,ymm24,0x01
+	vpxord	ymm10,ymm4,ymm7
+	vpshufd	ymm24,ymm24,0x4e
+	vpternlogd	ymm10,ymm24,ymm25,0x96
+	vextracti32x4	xmm4,ymm10,1
+	vpxord	xmm10,xmm10,xmm4
+
+$L$crypt_loop_4x_done__func1:
+
+	test	r8,r8
+	jz	NEAR $L$done__func1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	mov	rax,r8
+	neg	rax
+	and	rax,-16
+	lea	rsi,[256+rax*1+rdi]
+	vpxor	xmm4,xmm4,xmm4
+	vpxor	xmm5,xmm5,xmm5
+	vpxor	xmm6,xmm6,xmm6
+
+	cmp	r8,32
+	jb	NEAR $L$partial_vec__func1
+
+$L$crypt_loop_1x__func1:
+
+
+
+	vpshufb	ymm0,ymm12,ymm8
+	vpaddd	ymm12,ymm12,ymm11
+	vpxord	ymm0,ymm0,ymm13
+	lea	rax,[16+r9]
+$L$vaesenc_loop_tail_full_vec__func1:
+	vbroadcasti32x4	ymm9,YMMWORD[rax]
+	vaesenc	ymm0,ymm0,ymm9
+	add	rax,16
+	cmp	r11,rax
+	jne	NEAR $L$vaesenc_loop_tail_full_vec__func1
+	vaesenclast	ymm0,ymm0,ymm14
+
+
+	vmovdqu8	ymm1,YMMWORD[rcx]
+	vpxord	ymm0,ymm0,ymm1
+	vmovdqu8	YMMWORD[rdx],ymm0
+
+
+	vmovdqu8	ymm30,YMMWORD[rsi]
+	vpshufb	ymm0,ymm0,ymm8
+	vpxord	ymm0,ymm0,ymm10
+	vpclmulqdq	ymm7,ymm0,ymm30,0x00
+	vpclmulqdq	ymm1,ymm0,ymm30,0x01
+	vpclmulqdq	ymm2,ymm0,ymm30,0x10
+	vpclmulqdq	ymm3,ymm0,ymm30,0x11
+	vpxord	ymm4,ymm4,ymm7
+	vpternlogd	ymm5,ymm1,ymm2,0x96
+	vpxord	ymm6,ymm6,ymm3
+
+	vpxor	xmm10,xmm10,xmm10
+
+	add	rsi,32
+	add	rcx,32
+	add	rdx,32
+	sub	r8,32
+	cmp	r8,32
+	jae	NEAR $L$crypt_loop_1x__func1
+
+	test	r8,r8
+	jz	NEAR $L$reduce__func1
+
+$L$partial_vec__func1:
+
+
+
+
+	mov	rax,-1
+	bzhi	rax,rax,r8
+	kmovd	k1,eax
+	add	r8,15
+	and	r8,-16
+	mov	rax,-1
+	bzhi	rax,rax,r8
+	kmovd	k2,eax
+
+
+
+	vpshufb	ymm0,ymm12,ymm8
+	vpxord	ymm0,ymm0,ymm13
+	lea	rax,[16+r9]
+$L$vaesenc_loop_tail_partialvec__func1:
+	vbroadcasti32x4	ymm9,YMMWORD[rax]
+	vaesenc	ymm0,ymm0,ymm9
+	add	rax,16
+	cmp	r11,rax
+	jne	NEAR $L$vaesenc_loop_tail_partialvec__func1
+	vaesenclast	ymm0,ymm0,ymm14
+
+
+	vmovdqu8	ymm1{k1}{z},[rcx]
+	vpxord	ymm0,ymm0,ymm1
+	vmovdqu8	YMMWORD[rdx]{k1},ymm0
+
+
+
+
+
+
+
+
+
+
+
+
+
+	vmovdqu8	ymm30{k2}{z},[rsi]
+	vmovdqu8	ymm1{k1}{z},ymm0
+	vpshufb	ymm0,ymm1,ymm8
+	vpxord	ymm0,ymm0,ymm10
+	vpclmulqdq	ymm7,ymm0,ymm30,0x00
+	vpclmulqdq	ymm1,ymm0,ymm30,0x01
+	vpclmulqdq	ymm2,ymm0,ymm30,0x10
+	vpclmulqdq	ymm3,ymm0,ymm30,0x11
+	vpxord	ymm4,ymm4,ymm7
+	vpternlogd	ymm5,ymm1,ymm2,0x96
+	vpxord	ymm6,ymm6,ymm3
+
+
+$L$reduce__func1:
+
+	vpclmulqdq	ymm0,ymm31,ymm4,0x01
+	vpshufd	ymm4,ymm4,0x4e
+	vpternlogd	ymm5,ymm4,ymm0,0x96
+	vpclmulqdq	ymm0,ymm31,ymm5,0x01
+	vpshufd	ymm5,ymm5,0x4e
+	vpternlogd	ymm6,ymm5,ymm0,0x96
+
+	vextracti32x4	xmm0,ymm6,1
+	vpxord	xmm10,xmm6,xmm0
+
+
+$L$done__func1:
+
+	vpshufb	xmm10,xmm10,xmm8
+	vmovdqu	XMMWORD[r12],xmm10
+
+	vzeroupper
+	movdqa	xmm6,XMMWORD[rsp]
+	movdqa	xmm7,XMMWORD[16+rsp]
+	movdqa	xmm8,XMMWORD[32+rsp]
+	movdqa	xmm9,XMMWORD[48+rsp]
+	movdqa	xmm10,XMMWORD[64+rsp]
+	movdqa	xmm11,XMMWORD[80+rsp]
+	movdqa	xmm12,XMMWORD[96+rsp]
+	movdqa	xmm13,XMMWORD[112+rsp]
+	movdqa	xmm14,XMMWORD[128+rsp]
+	movdqa	xmm15,XMMWORD[144+rsp]
+	add	rsp,160
+	pop	r12
+	pop	rdi
+	pop	rsi
+	ret
+$L$SEH_end_aes_gcm_enc_update_vaes_avx10_256_17:
+
+
+global	aes_gcm_dec_update_vaes_avx10_256
+
+ALIGN	32
+aes_gcm_dec_update_vaes_avx10_256:
+
+$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1:
+_CET_ENDBR
+	push	rsi
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_2:
+	push	rdi
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_3:
+	push	r12
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_4:
+
+	mov	rsi,QWORD[64+rsp]
+	mov	rdi,QWORD[72+rsp]
+	mov	r12,QWORD[80+rsp]
+	sub	rsp,160
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_5:
+	movdqa	XMMWORD[rsp],xmm6
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_6:
+	movdqa	XMMWORD[16+rsp],xmm7
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_7:
+	movdqa	XMMWORD[32+rsp],xmm8
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_8:
+	movdqa	XMMWORD[48+rsp],xmm9
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_9:
+	movdqa	XMMWORD[64+rsp],xmm10
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_10:
+	movdqa	XMMWORD[80+rsp],xmm11
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_11:
+	movdqa	XMMWORD[96+rsp],xmm12
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_12:
+	movdqa	XMMWORD[112+rsp],xmm13
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_13:
+	movdqa	XMMWORD[128+rsp],xmm14
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_14:
+	movdqa	XMMWORD[144+rsp],xmm15
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_15:
+
+$L$SEH_endprologue_aes_gcm_dec_update_vaes_avx10_256_16:
+
+	vbroadcasti32x4	ymm8,YMMWORD[$L$bswap_mask]
+	vbroadcasti32x4	ymm31,YMMWORD[$L$gfpoly]
+
+
+
+	vmovdqu	xmm10,XMMWORD[r12]
+	vpshufb	xmm10,xmm10,xmm8
+	vbroadcasti32x4	ymm12,YMMWORD[rsi]
+	vpshufb	ymm12,ymm12,ymm8
+
+
+
+	mov	r10d,DWORD[240+r9]
+	lea	r10d,[((-20))+r10*4]
+
+
+
+
+	lea	r11,[96+r10*4+r9]
+	vbroadcasti32x4	ymm13,YMMWORD[r9]
+	vbroadcasti32x4	ymm14,YMMWORD[r11]
+
+
+	vpaddd	ymm12,ymm12,YMMWORD[$L$ctr_pattern]
+
+
+	vbroadcasti32x4	ymm11,YMMWORD[$L$inc_2blocks]
+
+
+
+	cmp	r8,4*32-1
+	jbe	NEAR $L$crypt_loop_4x_done__func2
+
+
+	vmovdqu8	ymm27,YMMWORD[((256-128))+rdi]
+	vmovdqu8	ymm28,YMMWORD[((256-96))+rdi]
+	vmovdqu8	ymm29,YMMWORD[((256-64))+rdi]
+	vmovdqu8	ymm30,YMMWORD[((256-32))+rdi]
+	vbroadcasti32x4	ymm15,YMMWORD[((-144))+r11]
+	vbroadcasti32x4	ymm16,YMMWORD[((-128))+r11]
+	vbroadcasti32x4	ymm17,YMMWORD[((-112))+r11]
+	vbroadcasti32x4	ymm18,YMMWORD[((-96))+r11]
+	vbroadcasti32x4	ymm19,YMMWORD[((-80))+r11]
+	vbroadcasti32x4	ymm20,YMMWORD[((-64))+r11]
+	vbroadcasti32x4	ymm21,YMMWORD[((-48))+r11]
+	vbroadcasti32x4	ymm22,YMMWORD[((-32))+r11]
+	vbroadcasti32x4	ymm23,YMMWORD[((-16))+r11]
+$L$crypt_loop_4x__func2:
+	vmovdqu8	ymm4,YMMWORD[rcx]
+	vmovdqu8	ymm5,YMMWORD[32+rcx]
+	vmovdqu8	ymm6,YMMWORD[64+rcx]
+	vmovdqu8	ymm7,YMMWORD[96+rcx]
+
+
+
+	vpshufb	ymm0,ymm12,ymm8
+	vpaddd	ymm12,ymm12,ymm11
+	vpshufb	ymm1,ymm12,ymm8
+	vpaddd	ymm12,ymm12,ymm11
+	vpshufb	ymm2,ymm12,ymm8
+	vpaddd	ymm12,ymm12,ymm11
+	vpshufb	ymm3,ymm12,ymm8
+	vpaddd	ymm12,ymm12,ymm11
+
+
+	vpxord	ymm0,ymm0,ymm13
+	vpxord	ymm1,ymm1,ymm13
+	vpxord	ymm2,ymm2,ymm13
+	vpxord	ymm3,ymm3,ymm13
+
+	cmp	r10d,24
+	jl	NEAR $L$aes128__func2
+	je	NEAR $L$aes192__func2
+
+	vbroadcasti32x4	ymm9,YMMWORD[((-208))+r11]
+	vaesenc	ymm0,ymm0,ymm9
+	vaesenc	ymm1,ymm1,ymm9
+	vaesenc	ymm2,ymm2,ymm9
+	vaesenc	ymm3,ymm3,ymm9
+
+	vbroadcasti32x4	ymm9,YMMWORD[((-192))+r11]
+	vaesenc	ymm0,ymm0,ymm9
+	vaesenc	ymm1,ymm1,ymm9
+	vaesenc	ymm2,ymm2,ymm9
+	vaesenc	ymm3,ymm3,ymm9
+
+$L$aes192__func2:
+	vbroadcasti32x4	ymm9,YMMWORD[((-176))+r11]
+	vaesenc	ymm0,ymm0,ymm9
+	vaesenc	ymm1,ymm1,ymm9
+	vaesenc	ymm2,ymm2,ymm9
+	vaesenc	ymm3,ymm3,ymm9
+
+	vbroadcasti32x4	ymm9,YMMWORD[((-160))+r11]
+	vaesenc	ymm0,ymm0,ymm9
+	vaesenc	ymm1,ymm1,ymm9
+	vaesenc	ymm2,ymm2,ymm9
+	vaesenc	ymm3,ymm3,ymm9
+
+$L$aes128__func2:
+	vpshufb	ymm4,ymm4,ymm8
+	vpxord	ymm4,ymm4,ymm10
+	vpshufb	ymm5,ymm5,ymm8
+	vpshufb	ymm6,ymm6,ymm8
+
+	vaesenc	ymm0,ymm0,ymm15
+	vaesenc	ymm1,ymm1,ymm15
+	vaesenc	ymm2,ymm2,ymm15
+	vaesenc	ymm3,ymm3,ymm15
+
+	vpshufb	ymm7,ymm7,ymm8
+	vpclmulqdq	ymm10,ymm4,ymm27,0x00
+	vpclmulqdq	ymm24,ymm5,ymm28,0x00
+	vpclmulqdq	ymm25,ymm6,ymm29,0x00
+
+	vaesenc	ymm0,ymm0,ymm16
+	vaesenc	ymm1,ymm1,ymm16
+	vaesenc	ymm2,ymm2,ymm16
+	vaesenc	ymm3,ymm3,ymm16
+
+	vpxord	ymm10,ymm10,ymm24
+	vpclmulqdq	ymm26,ymm7,ymm30,0x00
+	vpternlogd	ymm10,ymm25,ymm26,0x96
+	vpclmulqdq	ymm24,ymm4,ymm27,0x01
+
+	vaesenc	ymm0,ymm0,ymm17
+	vaesenc	ymm1,ymm1,ymm17
+	vaesenc	ymm2,ymm2,ymm17
+	vaesenc	ymm3,ymm3,ymm17
+
+	vpclmulqdq	ymm25,ymm5,ymm28,0x01
+	vpclmulqdq	ymm26,ymm6,ymm29,0x01
+	vpternlogd	ymm24,ymm25,ymm26,0x96
+	vpclmulqdq	ymm25,ymm7,ymm30,0x01
+
+	vaesenc	ymm0,ymm0,ymm18
+	vaesenc	ymm1,ymm1,ymm18
+	vaesenc	ymm2,ymm2,ymm18
+	vaesenc	ymm3,ymm3,ymm18
+
+	vpclmulqdq	ymm26,ymm4,ymm27,0x10
+	vpternlogd	ymm24,ymm25,ymm26,0x96
+	vpclmulqdq	ymm25,ymm5,ymm28,0x10
+	vpclmulqdq	ymm26,ymm6,ymm29,0x10
+
+	vaesenc	ymm0,ymm0,ymm19
+	vaesenc	ymm1,ymm1,ymm19
+	vaesenc	ymm2,ymm2,ymm19
+	vaesenc	ymm3,ymm3,ymm19
+
+	vpternlogd	ymm24,ymm25,ymm26,0x96
+	vpclmulqdq	ymm26,ymm31,ymm10,0x01
+	vpclmulqdq	ymm25,ymm7,ymm30,0x10
+	vpxord	ymm24,ymm24,ymm25
+
+	vaesenc	ymm0,ymm0,ymm20
+	vaesenc	ymm1,ymm1,ymm20
+	vaesenc	ymm2,ymm2,ymm20
+	vaesenc	ymm3,ymm3,ymm20
+
+	vpshufd	ymm10,ymm10,0x4e
+	vpclmulqdq	ymm4,ymm4,ymm27,0x11
+	vpclmulqdq	ymm5,ymm5,ymm28,0x11
+	vpclmulqdq	ymm6,ymm6,ymm29,0x11
+
+	vaesenc	ymm0,ymm0,ymm21
+	vaesenc	ymm1,ymm1,ymm21
+	vaesenc	ymm2,ymm2,ymm21
+	vaesenc	ymm3,ymm3,ymm21
+
+	vpternlogd	ymm24,ymm10,ymm26,0x96
+	vpclmulqdq	ymm7,ymm7,ymm30,0x11
+	vpternlogd	ymm4,ymm5,ymm6,0x96
+	vpclmulqdq	ymm25,ymm31,ymm24,0x01
+
+	vaesenc	ymm0,ymm0,ymm22
+	vaesenc	ymm1,ymm1,ymm22
+	vaesenc	ymm2,ymm2,ymm22
+	vaesenc	ymm3,ymm3,ymm22
+
+	vpxord	ymm10,ymm4,ymm7
+	vpshufd	ymm24,ymm24,0x4e
+	vpternlogd	ymm10,ymm24,ymm25,0x96
+
+	vaesenc	ymm0,ymm0,ymm23
+	vaesenc	ymm1,ymm1,ymm23
+	vaesenc	ymm2,ymm2,ymm23
+	vaesenc	ymm3,ymm3,ymm23
+
+	vextracti32x4	xmm4,ymm10,1
+	vpxord	xmm10,xmm10,xmm4
+
+
+
+
+	vpxord	ymm4,ymm14,YMMWORD[rcx]
+	vpxord	ymm5,ymm14,YMMWORD[32+rcx]
+	vpxord	ymm6,ymm14,YMMWORD[64+rcx]
+	vpxord	ymm7,ymm14,YMMWORD[96+rcx]
+
+
+
+	vaesenclast	ymm4,ymm0,ymm4
+	vaesenclast	ymm5,ymm1,ymm5
+	vaesenclast	ymm6,ymm2,ymm6
+	vaesenclast	ymm7,ymm3,ymm7
+
+
+	vmovdqu8	YMMWORD[rdx],ymm4
+	vmovdqu8	YMMWORD[32+rdx],ymm5
+	vmovdqu8	YMMWORD[64+rdx],ymm6
+	vmovdqu8	YMMWORD[96+rdx],ymm7
+
+	sub	rcx,-4*32
+	sub	rdx,-4*32
+	add	r8,-4*32
+	cmp	r8,4*32-1
+	ja	NEAR $L$crypt_loop_4x__func2
+$L$crypt_loop_4x_done__func2:
+
+	test	r8,r8
+	jz	NEAR $L$done__func2
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	mov	rax,r8
+	neg	rax
+	and	rax,-16
+	lea	rsi,[256+rax*1+rdi]
+	vpxor	xmm4,xmm4,xmm4
+	vpxor	xmm5,xmm5,xmm5
+	vpxor	xmm6,xmm6,xmm6
+
+	cmp	r8,32
+	jb	NEAR $L$partial_vec__func2
+
+$L$crypt_loop_1x__func2:
+
+
+
+	vpshufb	ymm0,ymm12,ymm8
+	vpaddd	ymm12,ymm12,ymm11
+	vpxord	ymm0,ymm0,ymm13
+	lea	rax,[16+r9]
+$L$vaesenc_loop_tail_full_vec__func2:
+	vbroadcasti32x4	ymm9,YMMWORD[rax]
+	vaesenc	ymm0,ymm0,ymm9
+	add	rax,16
+	cmp	r11,rax
+	jne	NEAR $L$vaesenc_loop_tail_full_vec__func2
+	vaesenclast	ymm0,ymm0,ymm14
+
+
+	vmovdqu8	ymm1,YMMWORD[rcx]
+	vpxord	ymm0,ymm0,ymm1
+	vmovdqu8	YMMWORD[rdx],ymm0
+
+
+	vmovdqu8	ymm30,YMMWORD[rsi]
+	vpshufb	ymm0,ymm1,ymm8
+	vpxord	ymm0,ymm0,ymm10
+	vpclmulqdq	ymm7,ymm0,ymm30,0x00
+	vpclmulqdq	ymm1,ymm0,ymm30,0x01
+	vpclmulqdq	ymm2,ymm0,ymm30,0x10
+	vpclmulqdq	ymm3,ymm0,ymm30,0x11
+	vpxord	ymm4,ymm4,ymm7
+	vpternlogd	ymm5,ymm1,ymm2,0x96
+	vpxord	ymm6,ymm6,ymm3
+
+	vpxor	xmm10,xmm10,xmm10
+
+	add	rsi,32
+	add	rcx,32
+	add	rdx,32
+	sub	r8,32
+	cmp	r8,32
+	jae	NEAR $L$crypt_loop_1x__func2
+
+	test	r8,r8
+	jz	NEAR $L$reduce__func2
+
+$L$partial_vec__func2:
+
+
+
+
+	mov	rax,-1
+	bzhi	rax,rax,r8
+	kmovd	k1,eax
+	add	r8,15
+	and	r8,-16
+	mov	rax,-1
+	bzhi	rax,rax,r8
+	kmovd	k2,eax
+
+
+
+	vpshufb	ymm0,ymm12,ymm8
+	vpxord	ymm0,ymm0,ymm13
+	lea	rax,[16+r9]
+$L$vaesenc_loop_tail_partialvec__func2:
+	vbroadcasti32x4	ymm9,YMMWORD[rax]
+	vaesenc	ymm0,ymm0,ymm9
+	add	rax,16
+	cmp	r11,rax
+	jne	NEAR $L$vaesenc_loop_tail_partialvec__func2
+	vaesenclast	ymm0,ymm0,ymm14
+
+
+	vmovdqu8	ymm1{k1}{z},[rcx]
+	vpxord	ymm0,ymm0,ymm1
+	vmovdqu8	YMMWORD[rdx]{k1},ymm0
+
+
+
+
+
+
+
+
+
+
+
+
+
+	vmovdqu8	ymm30{k2}{z},[rsi]
+
+	vpshufb	ymm0,ymm1,ymm8
+	vpxord	ymm0,ymm0,ymm10
+	vpclmulqdq	ymm7,ymm0,ymm30,0x00
+	vpclmulqdq	ymm1,ymm0,ymm30,0x01
+	vpclmulqdq	ymm2,ymm0,ymm30,0x10
+	vpclmulqdq	ymm3,ymm0,ymm30,0x11
+	vpxord	ymm4,ymm4,ymm7
+	vpternlogd	ymm5,ymm1,ymm2,0x96
+	vpxord	ymm6,ymm6,ymm3
+
+
+$L$reduce__func2:
+
+	vpclmulqdq	ymm0,ymm31,ymm4,0x01
+	vpshufd	ymm4,ymm4,0x4e
+	vpternlogd	ymm5,ymm4,ymm0,0x96
+	vpclmulqdq	ymm0,ymm31,ymm5,0x01
+	vpshufd	ymm5,ymm5,0x4e
+	vpternlogd	ymm6,ymm5,ymm0,0x96
+
+	vextracti32x4	xmm0,ymm6,1
+	vpxord	xmm10,xmm6,xmm0
+
+
+$L$done__func2:
+
+	vpshufb	xmm10,xmm10,xmm8
+	vmovdqu	XMMWORD[r12],xmm10
+
+	vzeroupper
+	movdqa	xmm6,XMMWORD[rsp]
+	movdqa	xmm7,XMMWORD[16+rsp]
+	movdqa	xmm8,XMMWORD[32+rsp]
+	movdqa	xmm9,XMMWORD[48+rsp]
+	movdqa	xmm10,XMMWORD[64+rsp]
+	movdqa	xmm11,XMMWORD[80+rsp]
+	movdqa	xmm12,XMMWORD[96+rsp]
+	movdqa	xmm13,XMMWORD[112+rsp]
+	movdqa	xmm14,XMMWORD[128+rsp]
+	movdqa	xmm15,XMMWORD[144+rsp]
+	add	rsp,160
+	pop	r12
+	pop	rdi
+	pop	rsi
+	ret
+$L$SEH_end_aes_gcm_dec_update_vaes_avx10_256_17:
+
+
+global	gcm_ghash_vpclmulqdq_avx10_512
+
+ALIGN	32
+gcm_ghash_vpclmulqdq_avx10_512:
+
+$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1:
+_CET_ENDBR
+	sub	rsp,136
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_2:
+	movdqa	XMMWORD[rsp],xmm6
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_3:
+	movdqa	XMMWORD[16+rsp],xmm7
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_4:
+	movdqa	XMMWORD[32+rsp],xmm8
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_5:
+	movdqa	XMMWORD[48+rsp],xmm9
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_6:
+	movdqa	XMMWORD[64+rsp],xmm10
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_7:
+	movdqa	XMMWORD[80+rsp],xmm11
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_8:
+	movdqa	XMMWORD[96+rsp],xmm12
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_9:
+	movdqa	XMMWORD[112+rsp],xmm13
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_10:
+
+$L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx10_512_11:
+
+
+
+
+	vmovdqu	xmm4,XMMWORD[$L$bswap_mask]
+	vmovdqu	xmm10,XMMWORD[$L$gfpoly]
+
+
+	vmovdqu	xmm5,XMMWORD[rcx]
+	vpshufb	xmm5,xmm5,xmm4
+
+
+	cmp	r9,64
+	jb	NEAR $L$aad_blockbyblock__func2
+
+
+
+	vshufi64x2	zmm4,zmm4,zmm4,0
+	vshufi64x2	zmm10,zmm10,zmm10,0
+
+
+	vmovdqu8	zmm9,ZMMWORD[((256-64))+rdx]
+
+	cmp	r9,4*64-1
+	jbe	NEAR $L$aad_loop_1x__func2
+
+
+	vmovdqu8	zmm6,ZMMWORD[((256-256))+rdx]
+	vmovdqu8	zmm7,ZMMWORD[((256-192))+rdx]
+	vmovdqu8	zmm8,ZMMWORD[((256-128))+rdx]
+
+
+$L$aad_loop_4x__func2:
+	vmovdqu8	zmm0,ZMMWORD[r8]
+	vmovdqu8	zmm1,ZMMWORD[64+r8]
+	vmovdqu8	zmm2,ZMMWORD[128+r8]
+	vmovdqu8	zmm3,ZMMWORD[192+r8]
+	vpshufb	zmm0,zmm0,zmm4
+	vpxord	zmm0,zmm0,zmm5
+	vpshufb	zmm1,zmm1,zmm4
+	vpshufb	zmm2,zmm2,zmm4
+	vpshufb	zmm3,zmm3,zmm4
+	vpclmulqdq	zmm5,zmm0,zmm6,0x00
+	vpclmulqdq	zmm11,zmm1,zmm7,0x00
+	vpclmulqdq	zmm12,zmm2,zmm8,0x00
+	vpxord	zmm5,zmm5,zmm11
+	vpclmulqdq	zmm13,zmm3,zmm9,0x00
+	vpternlogd	zmm5,zmm12,zmm13,0x96
+	vpclmulqdq	zmm11,zmm0,zmm6,0x01
+	vpclmulqdq	zmm12,zmm1,zmm7,0x01
+	vpclmulqdq	zmm13,zmm2,zmm8,0x01
+	vpternlogd	zmm11,zmm12,zmm13,0x96
+	vpclmulqdq	zmm12,zmm3,zmm9,0x01
+	vpclmulqdq	zmm13,zmm0,zmm6,0x10
+	vpternlogd	zmm11,zmm12,zmm13,0x96
+	vpclmulqdq	zmm12,zmm1,zmm7,0x10
+	vpclmulqdq	zmm13,zmm2,zmm8,0x10
+	vpternlogd	zmm11,zmm12,zmm13,0x96
+	vpclmulqdq	zmm13,zmm10,zmm5,0x01
+	vpclmulqdq	zmm12,zmm3,zmm9,0x10
+	vpxord	zmm11,zmm11,zmm12
+	vpshufd	zmm5,zmm5,0x4e
+	vpclmulqdq	zmm0,zmm0,zmm6,0x11
+	vpclmulqdq	zmm1,zmm1,zmm7,0x11
+	vpclmulqdq	zmm2,zmm2,zmm8,0x11
+	vpternlogd	zmm11,zmm5,zmm13,0x96
+	vpclmulqdq	zmm3,zmm3,zmm9,0x11
+	vpternlogd	zmm0,zmm1,zmm2,0x96
+	vpclmulqdq	zmm12,zmm10,zmm11,0x01
+	vpxord	zmm5,zmm0,zmm3
+	vpshufd	zmm11,zmm11,0x4e
+	vpternlogd	zmm5,zmm11,zmm12,0x96
+	vextracti32x4	xmm0,zmm5,1
+	vextracti32x4	xmm1,zmm5,2
+	vextracti32x4	xmm2,zmm5,3
+	vpxord	xmm5,xmm5,xmm0
+	vpternlogd	xmm5,xmm2,xmm1,0x96
+
+	sub	r8,-4*64
+	add	r9,-4*64
+	cmp	r9,4*64-1
+	ja	NEAR $L$aad_loop_4x__func2
+
+
+	cmp	r9,64
+	jb	NEAR $L$aad_large_done__func2
+$L$aad_loop_1x__func2:
+	vmovdqu8	zmm0,ZMMWORD[r8]
+	vpshufb	zmm0,zmm0,zmm4
+	vpxord	zmm5,zmm5,zmm0
+	vpclmulqdq	zmm0,zmm5,zmm9,0x00
+	vpclmulqdq	zmm1,zmm5,zmm9,0x01
+	vpclmulqdq	zmm2,zmm5,zmm9,0x10
+	vpxord	zmm1,zmm1,zmm2
+	vpclmulqdq	zmm2,zmm10,zmm0,0x01
+	vpshufd	zmm0,zmm0,0x4e
+	vpternlogd	zmm1,zmm0,zmm2,0x96
+	vpclmulqdq	zmm5,zmm5,zmm9,0x11
+	vpclmulqdq	zmm0,zmm10,zmm1,0x01
+	vpshufd	zmm1,zmm1,0x4e
+	vpternlogd	zmm5,zmm1,zmm0,0x96
+
+	vextracti32x4	xmm0,zmm5,1
+	vextracti32x4	xmm1,zmm5,2
+	vextracti32x4	xmm2,zmm5,3
+	vpxord	xmm5,xmm5,xmm0
+	vpternlogd	xmm5,xmm2,xmm1,0x96
+
+	add	r8,64
+	sub	r9,64
+	cmp	r9,64
+	jae	NEAR $L$aad_loop_1x__func2
+
+$L$aad_large_done__func2:
+
+
+	vzeroupper
+
+
+$L$aad_blockbyblock__func2:
+	test	r9,r9
+	jz	NEAR $L$aad_done__func2
+	vmovdqu	xmm9,XMMWORD[((256-16))+rdx]
+$L$aad_loop_blockbyblock__func2:
+	vmovdqu	xmm0,XMMWORD[r8]
+	vpshufb	xmm0,xmm0,xmm4
+	vpxor	xmm5,xmm5,xmm0
+	vpclmulqdq	xmm0,xmm5,xmm9,0x00
+	vpclmulqdq	xmm1,xmm5,xmm9,0x01
+	vpclmulqdq	xmm2,xmm5,xmm9,0x10
+	vpxord	xmm1,xmm1,xmm2
+	vpclmulqdq	xmm2,xmm10,xmm0,0x01
+	vpshufd	xmm0,xmm0,0x4e
+	vpternlogd	xmm1,xmm0,xmm2,0x96
+	vpclmulqdq	xmm5,xmm5,xmm9,0x11
+	vpclmulqdq	xmm0,xmm10,xmm1,0x01
+	vpshufd	xmm1,xmm1,0x4e
+	vpternlogd	xmm5,xmm1,xmm0,0x96
+
+	add	r8,16
+	sub	r9,16
+	jnz	NEAR $L$aad_loop_blockbyblock__func2
+
+$L$aad_done__func2:
+
+	vpshufb	xmm5,xmm5,xmm4
+	vmovdqu	XMMWORD[rcx],xmm5
+	movdqa	xmm6,XMMWORD[rsp]
+	movdqa	xmm7,XMMWORD[16+rsp]
+	movdqa	xmm8,XMMWORD[32+rsp]
+	movdqa	xmm9,XMMWORD[48+rsp]
+	movdqa	xmm10,XMMWORD[64+rsp]
+	movdqa	xmm11,XMMWORD[80+rsp]
+	movdqa	xmm12,XMMWORD[96+rsp]
+	movdqa	xmm13,XMMWORD[112+rsp]
+	add	rsp,136
+	ret
+$L$SEH_end_gcm_ghash_vpclmulqdq_avx10_512_12:
+
+
+global	aes_gcm_enc_update_vaes_avx10_512
+
+ALIGN	32
+aes_gcm_enc_update_vaes_avx10_512:
+
+$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1:
+_CET_ENDBR
+	push	rsi
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_2:
+	push	rdi
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_3:
+	push	r12
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_4:
+
+	mov	rsi,QWORD[64+rsp]
+	mov	rdi,QWORD[72+rsp]
+	mov	r12,QWORD[80+rsp]
+	sub	rsp,160
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_5:
+	movdqa	XMMWORD[rsp],xmm6
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_6:
+	movdqa	XMMWORD[16+rsp],xmm7
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_7:
+	movdqa	XMMWORD[32+rsp],xmm8
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_8:
+	movdqa	XMMWORD[48+rsp],xmm9
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_9:
+	movdqa	XMMWORD[64+rsp],xmm10
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_10:
+	movdqa	XMMWORD[80+rsp],xmm11
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_11:
+	movdqa	XMMWORD[96+rsp],xmm12
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_12:
+	movdqa	XMMWORD[112+rsp],xmm13
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_13:
+	movdqa	XMMWORD[128+rsp],xmm14
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_14:
+	movdqa	XMMWORD[144+rsp],xmm15
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_15:
+
+$L$SEH_endprologue_aes_gcm_enc_update_vaes_avx10_512_16:
+%ifdef BORINGSSL_DISPATCH_TEST
+EXTERN	BORINGSSL_function_hit
+	mov	BYTE[((BORINGSSL_function_hit+7))],1
+%endif
+
+	vbroadcasti32x4	zmm8,ZMMWORD[$L$bswap_mask]
+	vbroadcasti32x4	zmm31,ZMMWORD[$L$gfpoly]
+
+
+
+	vmovdqu	xmm10,XMMWORD[r12]
+	vpshufb	xmm10,xmm10,xmm8
+	vbroadcasti32x4	zmm12,ZMMWORD[rsi]
+	vpshufb	zmm12,zmm12,zmm8
+
+
+
+	mov	r10d,DWORD[240+r9]
+	lea	r10d,[((-20))+r10*4]
+
+
+
+
+	lea	r11,[96+r10*4+r9]
+	vbroadcasti32x4	zmm13,ZMMWORD[r9]
+	vbroadcasti32x4	zmm14,ZMMWORD[r11]
+
+
+	vpaddd	zmm12,zmm12,ZMMWORD[$L$ctr_pattern]
+
+
+	vbroadcasti32x4	zmm11,ZMMWORD[$L$inc_4blocks]
+
+
+
+	cmp	r8,4*64-1
+	jbe	NEAR $L$crypt_loop_4x_done__func3
+
+
+	vmovdqu8	zmm27,ZMMWORD[((256-256))+rdi]
+	vmovdqu8	zmm28,ZMMWORD[((256-192))+rdi]
+	vmovdqu8	zmm29,ZMMWORD[((256-128))+rdi]
+	vmovdqu8	zmm30,ZMMWORD[((256-64))+rdi]
+
+
+
+
+	vpshufb	zmm0,zmm12,zmm8
+	vpaddd	zmm12,zmm12,zmm11
+	vpshufb	zmm1,zmm12,zmm8
+	vpaddd	zmm12,zmm12,zmm11
+	vpshufb	zmm2,zmm12,zmm8
+	vpaddd	zmm12,zmm12,zmm11
+	vpshufb	zmm3,zmm12,zmm8
+	vpaddd	zmm12,zmm12,zmm11
+
+
+	vpxord	zmm0,zmm0,zmm13
+	vpxord	zmm1,zmm1,zmm13
+	vpxord	zmm2,zmm2,zmm13
+	vpxord	zmm3,zmm3,zmm13
+
+	lea	rax,[16+r9]
+$L$vaesenc_loop_first_4_vecs__func3:
+	vbroadcasti32x4	zmm9,ZMMWORD[rax]
+	vaesenc	zmm0,zmm0,zmm9
+	vaesenc	zmm1,zmm1,zmm9
+	vaesenc	zmm2,zmm2,zmm9
+	vaesenc	zmm3,zmm3,zmm9
+
+	add	rax,16
+	cmp	r11,rax
+	jne	NEAR $L$vaesenc_loop_first_4_vecs__func3
+
+
+
+	vpxord	zmm4,zmm14,ZMMWORD[rcx]
+	vpxord	zmm5,zmm14,ZMMWORD[64+rcx]
+	vpxord	zmm6,zmm14,ZMMWORD[128+rcx]
+	vpxord	zmm7,zmm14,ZMMWORD[192+rcx]
+
+
+
+	vaesenclast	zmm4,zmm0,zmm4
+	vaesenclast	zmm5,zmm1,zmm5
+	vaesenclast	zmm6,zmm2,zmm6
+	vaesenclast	zmm7,zmm3,zmm7
+
+
+	vmovdqu8	ZMMWORD[rdx],zmm4
+	vmovdqu8	ZMMWORD[64+rdx],zmm5
+	vmovdqu8	ZMMWORD[128+rdx],zmm6
+	vmovdqu8	ZMMWORD[192+rdx],zmm7
+
+	sub	rcx,-4*64
+	sub	rdx,-4*64
+	add	r8,-4*64
+	cmp	r8,4*64-1
+	jbe	NEAR $L$ghash_last_ciphertext_4x__func3
+	vbroadcasti32x4	zmm15,ZMMWORD[((-144))+r11]
+	vbroadcasti32x4	zmm16,ZMMWORD[((-128))+r11]
+	vbroadcasti32x4	zmm17,ZMMWORD[((-112))+r11]
+	vbroadcasti32x4	zmm18,ZMMWORD[((-96))+r11]
+	vbroadcasti32x4	zmm19,ZMMWORD[((-80))+r11]
+	vbroadcasti32x4	zmm20,ZMMWORD[((-64))+r11]
+	vbroadcasti32x4	zmm21,ZMMWORD[((-48))+r11]
+	vbroadcasti32x4	zmm22,ZMMWORD[((-32))+r11]
+	vbroadcasti32x4	zmm23,ZMMWORD[((-16))+r11]
+$L$crypt_loop_4x__func3:
+
+
+
+	vpshufb	zmm0,zmm12,zmm8
+	vpaddd	zmm12,zmm12,zmm11
+	vpshufb	zmm1,zmm12,zmm8
+	vpaddd	zmm12,zmm12,zmm11
+	vpshufb	zmm2,zmm12,zmm8
+	vpaddd	zmm12,zmm12,zmm11
+	vpshufb	zmm3,zmm12,zmm8
+	vpaddd	zmm12,zmm12,zmm11
+
+
+	vpxord	zmm0,zmm0,zmm13
+	vpxord	zmm1,zmm1,zmm13
+	vpxord	zmm2,zmm2,zmm13
+	vpxord	zmm3,zmm3,zmm13
+
+	cmp	r10d,24
+	jl	NEAR $L$aes128__func3
+	je	NEAR $L$aes192__func3
+
+	vbroadcasti32x4	zmm9,ZMMWORD[((-208))+r11]
+	vaesenc	zmm0,zmm0,zmm9
+	vaesenc	zmm1,zmm1,zmm9
+	vaesenc	zmm2,zmm2,zmm9
+	vaesenc	zmm3,zmm3,zmm9
+
+	vbroadcasti32x4	zmm9,ZMMWORD[((-192))+r11]
+	vaesenc	zmm0,zmm0,zmm9
+	vaesenc	zmm1,zmm1,zmm9
+	vaesenc	zmm2,zmm2,zmm9
+	vaesenc	zmm3,zmm3,zmm9
+
+$L$aes192__func3:
+	vbroadcasti32x4	zmm9,ZMMWORD[((-176))+r11]
+	vaesenc	zmm0,zmm0,zmm9
+	vaesenc	zmm1,zmm1,zmm9
+	vaesenc	zmm2,zmm2,zmm9
+	vaesenc	zmm3,zmm3,zmm9
+
+	vbroadcasti32x4	zmm9,ZMMWORD[((-160))+r11]
+	vaesenc	zmm0,zmm0,zmm9
+	vaesenc	zmm1,zmm1,zmm9
+	vaesenc	zmm2,zmm2,zmm9
+	vaesenc	zmm3,zmm3,zmm9
+
+$L$aes128__func3:
+	vpshufb	zmm4,zmm4,zmm8
+	vpxord	zmm4,zmm4,zmm10
+	vpshufb	zmm5,zmm5,zmm8
+	vpshufb	zmm6,zmm6,zmm8
+
+	vaesenc	zmm0,zmm0,zmm15
+	vaesenc	zmm1,zmm1,zmm15
+	vaesenc	zmm2,zmm2,zmm15
+	vaesenc	zmm3,zmm3,zmm15
+
+	vpshufb	zmm7,zmm7,zmm8
+	vpclmulqdq	zmm10,zmm4,zmm27,0x00
+	vpclmulqdq	zmm24,zmm5,zmm28,0x00
+	vpclmulqdq	zmm25,zmm6,zmm29,0x00
+
+	vaesenc	zmm0,zmm0,zmm16
+	vaesenc	zmm1,zmm1,zmm16
+	vaesenc	zmm2,zmm2,zmm16
+	vaesenc	zmm3,zmm3,zmm16
+
+	vpxord	zmm10,zmm10,zmm24
+	vpclmulqdq	zmm26,zmm7,zmm30,0x00
+	vpternlogd	zmm10,zmm25,zmm26,0x96
+	vpclmulqdq	zmm24,zmm4,zmm27,0x01
+
+	vaesenc	zmm0,zmm0,zmm17
+	vaesenc	zmm1,zmm1,zmm17
+	vaesenc	zmm2,zmm2,zmm17
+	vaesenc	zmm3,zmm3,zmm17
+
+	vpclmulqdq	zmm25,zmm5,zmm28,0x01
+	vpclmulqdq	zmm26,zmm6,zmm29,0x01
+	vpternlogd	zmm24,zmm25,zmm26,0x96
+	vpclmulqdq	zmm25,zmm7,zmm30,0x01
+
+	vaesenc	zmm0,zmm0,zmm18
+	vaesenc	zmm1,zmm1,zmm18
+	vaesenc	zmm2,zmm2,zmm18
+	vaesenc	zmm3,zmm3,zmm18
+
+	vpclmulqdq	zmm26,zmm4,zmm27,0x10
+	vpternlogd	zmm24,zmm25,zmm26,0x96
+	vpclmulqdq	zmm25,zmm5,zmm28,0x10
+	vpclmulqdq	zmm26,zmm6,zmm29,0x10
+
+	vaesenc	zmm0,zmm0,zmm19
+	vaesenc	zmm1,zmm1,zmm19
+	vaesenc	zmm2,zmm2,zmm19
+	vaesenc	zmm3,zmm3,zmm19
+
+	vpternlogd	zmm24,zmm25,zmm26,0x96
+	vpclmulqdq	zmm26,zmm31,zmm10,0x01
+	vpclmulqdq	zmm25,zmm7,zmm30,0x10
+	vpxord	zmm24,zmm24,zmm25
+
+	vaesenc	zmm0,zmm0,zmm20
+	vaesenc	zmm1,zmm1,zmm20
+	vaesenc	zmm2,zmm2,zmm20
+	vaesenc	zmm3,zmm3,zmm20
+
+	vpshufd	zmm10,zmm10,0x4e
+	vpclmulqdq	zmm4,zmm4,zmm27,0x11
+	vpclmulqdq	zmm5,zmm5,zmm28,0x11
+	vpclmulqdq	zmm6,zmm6,zmm29,0x11
+
+	vaesenc	zmm0,zmm0,zmm21
+	vaesenc	zmm1,zmm1,zmm21
+	vaesenc	zmm2,zmm2,zmm21
+	vaesenc	zmm3,zmm3,zmm21
+
+	vpternlogd	zmm24,zmm10,zmm26,0x96
+	vpclmulqdq	zmm7,zmm7,zmm30,0x11
+	vpternlogd	zmm4,zmm5,zmm6,0x96
+	vpclmulqdq	zmm25,zmm31,zmm24,0x01
+
+	vaesenc	zmm0,zmm0,zmm22
+	vaesenc	zmm1,zmm1,zmm22
+	vaesenc	zmm2,zmm2,zmm22
+	vaesenc	zmm3,zmm3,zmm22
+
+	vpxord	zmm10,zmm4,zmm7
+	vpshufd	zmm24,zmm24,0x4e
+	vpternlogd	zmm10,zmm24,zmm25,0x96
+
+	vaesenc	zmm0,zmm0,zmm23
+	vaesenc	zmm1,zmm1,zmm23
+	vaesenc	zmm2,zmm2,zmm23
+	vaesenc	zmm3,zmm3,zmm23
+
+	vextracti32x4	xmm4,zmm10,1
+	vextracti32x4	xmm5,zmm10,2
+	vextracti32x4	xmm6,zmm10,3
+	vpxord	xmm10,xmm10,xmm4
+	vpternlogd	xmm10,xmm6,xmm5,0x96
+
+
+
+
+	vpxord	zmm4,zmm14,ZMMWORD[rcx]
+	vpxord	zmm5,zmm14,ZMMWORD[64+rcx]
+	vpxord	zmm6,zmm14,ZMMWORD[128+rcx]
+	vpxord	zmm7,zmm14,ZMMWORD[192+rcx]
+
+
+
+	vaesenclast	zmm4,zmm0,zmm4
+	vaesenclast	zmm5,zmm1,zmm5
+	vaesenclast	zmm6,zmm2,zmm6
+	vaesenclast	zmm7,zmm3,zmm7
+
+
+	vmovdqu8	ZMMWORD[rdx],zmm4
+	vmovdqu8	ZMMWORD[64+rdx],zmm5
+	vmovdqu8	ZMMWORD[128+rdx],zmm6
+	vmovdqu8	ZMMWORD[192+rdx],zmm7
+
+	sub	rcx,-4*64
+	sub	rdx,-4*64
+	add	r8,-4*64
+	cmp	r8,4*64-1
+	ja	NEAR $L$crypt_loop_4x__func3
+$L$ghash_last_ciphertext_4x__func3:
+	vpshufb	zmm4,zmm4,zmm8
+	vpxord	zmm4,zmm4,zmm10
+	vpshufb	zmm5,zmm5,zmm8
+	vpshufb	zmm6,zmm6,zmm8
+	vpshufb	zmm7,zmm7,zmm8
+	vpclmulqdq	zmm10,zmm4,zmm27,0x00
+	vpclmulqdq	zmm24,zmm5,zmm28,0x00
+	vpclmulqdq	zmm25,zmm6,zmm29,0x00
+	vpxord	zmm10,zmm10,zmm24
+	vpclmulqdq	zmm26,zmm7,zmm30,0x00
+	vpternlogd	zmm10,zmm25,zmm26,0x96
+	vpclmulqdq	zmm24,zmm4,zmm27,0x01
+	vpclmulqdq	zmm25,zmm5,zmm28,0x01
+	vpclmulqdq	zmm26,zmm6,zmm29,0x01
+	vpternlogd	zmm24,zmm25,zmm26,0x96
+	vpclmulqdq	zmm25,zmm7,zmm30,0x01
+	vpclmulqdq	zmm26,zmm4,zmm27,0x10
+	vpternlogd	zmm24,zmm25,zmm26,0x96
+	vpclmulqdq	zmm25,zmm5,zmm28,0x10
+	vpclmulqdq	zmm26,zmm6,zmm29,0x10
+	vpternlogd	zmm24,zmm25,zmm26,0x96
+	vpclmulqdq	zmm26,zmm31,zmm10,0x01
+	vpclmulqdq	zmm25,zmm7,zmm30,0x10
+	vpxord	zmm24,zmm24,zmm25
+	vpshufd	zmm10,zmm10,0x4e
+	vpclmulqdq	zmm4,zmm4,zmm27,0x11
+	vpclmulqdq	zmm5,zmm5,zmm28,0x11
+	vpclmulqdq	zmm6,zmm6,zmm29,0x11
+	vpternlogd	zmm24,zmm10,zmm26,0x96
+	vpclmulqdq	zmm7,zmm7,zmm30,0x11
+	vpternlogd	zmm4,zmm5,zmm6,0x96
+	vpclmulqdq	zmm25,zmm31,zmm24,0x01
+	vpxord	zmm10,zmm4,zmm7
+	vpshufd	zmm24,zmm24,0x4e
+	vpternlogd	zmm10,zmm24,zmm25,0x96
+	vextracti32x4	xmm4,zmm10,1
+	vextracti32x4	xmm5,zmm10,2
+	vextracti32x4	xmm6,zmm10,3
+	vpxord	xmm10,xmm10,xmm4
+	vpternlogd	xmm10,xmm6,xmm5,0x96
+
+$L$crypt_loop_4x_done__func3:
+
+	test	r8,r8
+	jz	NEAR $L$done__func3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	mov	rax,r8
+	neg	rax
+	and	rax,-16
+	lea	rsi,[256+rax*1+rdi]
+	vpxor	xmm4,xmm4,xmm4
+	vpxor	xmm5,xmm5,xmm5
+	vpxor	xmm6,xmm6,xmm6
+
+	cmp	r8,64
+	jb	NEAR $L$partial_vec__func3
+
+$L$crypt_loop_1x__func3:
+
+
+
+	vpshufb	zmm0,zmm12,zmm8
+	vpaddd	zmm12,zmm12,zmm11
+	vpxord	zmm0,zmm0,zmm13
+	lea	rax,[16+r9]
+$L$vaesenc_loop_tail_full_vec__func3:
+	vbroadcasti32x4	zmm9,ZMMWORD[rax]
+	vaesenc	zmm0,zmm0,zmm9
+	add	rax,16
+	cmp	r11,rax
+	jne	NEAR $L$vaesenc_loop_tail_full_vec__func3
+	vaesenclast	zmm0,zmm0,zmm14
+
+
+	vmovdqu8	zmm1,ZMMWORD[rcx]
+	vpxord	zmm0,zmm0,zmm1
+	vmovdqu8	ZMMWORD[rdx],zmm0
+
+
+	vmovdqu8	zmm30,ZMMWORD[rsi]
+	vpshufb	zmm0,zmm0,zmm8
+	vpxord	zmm0,zmm0,zmm10
+	vpclmulqdq	zmm7,zmm0,zmm30,0x00
+	vpclmulqdq	zmm1,zmm0,zmm30,0x01
+	vpclmulqdq	zmm2,zmm0,zmm30,0x10
+	vpclmulqdq	zmm3,zmm0,zmm30,0x11
+	vpxord	zmm4,zmm4,zmm7
+	vpternlogd	zmm5,zmm1,zmm2,0x96
+	vpxord	zmm6,zmm6,zmm3
+
+	vpxor	xmm10,xmm10,xmm10
+
+	add	rsi,64
+	add	rcx,64
+	add	rdx,64
+	sub	r8,64
+	cmp	r8,64
+	jae	NEAR $L$crypt_loop_1x__func3
+
+	test	r8,r8
+	jz	NEAR $L$reduce__func3
+
+$L$partial_vec__func3:
+
+
+
+
+	mov	rax,-1
+	bzhi	rax,rax,r8
+	kmovq	k1,rax
+	add	r8,15
+	and	r8,-16
+	mov	rax,-1
+	bzhi	rax,rax,r8
+	kmovq	k2,rax
+
+
+
+	vpshufb	zmm0,zmm12,zmm8
+	vpxord	zmm0,zmm0,zmm13
+	lea	rax,[16+r9]
+$L$vaesenc_loop_tail_partialvec__func3:
+	vbroadcasti32x4	zmm9,ZMMWORD[rax]
+	vaesenc	zmm0,zmm0,zmm9
+	add	rax,16
+	cmp	r11,rax
+	jne	NEAR $L$vaesenc_loop_tail_partialvec__func3
+	vaesenclast	zmm0,zmm0,zmm14
+
+
+	vmovdqu8	zmm1{k1}{z},[rcx]
+	vpxord	zmm0,zmm0,zmm1
+	vmovdqu8	ZMMWORD[rdx]{k1},zmm0
+
+
+
+
+
+
+
+
+
+
+
+
+
+	vmovdqu8	zmm30{k2}{z},[rsi]
+	vmovdqu8	zmm1{k1}{z},zmm0
+	vpshufb	zmm0,zmm1,zmm8
+	vpxord	zmm0,zmm0,zmm10
+	vpclmulqdq	zmm7,zmm0,zmm30,0x00
+	vpclmulqdq	zmm1,zmm0,zmm30,0x01
+	vpclmulqdq	zmm2,zmm0,zmm30,0x10
+	vpclmulqdq	zmm3,zmm0,zmm30,0x11
+	vpxord	zmm4,zmm4,zmm7
+	vpternlogd	zmm5,zmm1,zmm2,0x96
+	vpxord	zmm6,zmm6,zmm3
+
+
+$L$reduce__func3:
+
+	vpclmulqdq	zmm0,zmm31,zmm4,0x01
+	vpshufd	zmm4,zmm4,0x4e
+	vpternlogd	zmm5,zmm4,zmm0,0x96
+	vpclmulqdq	zmm0,zmm31,zmm5,0x01
+	vpshufd	zmm5,zmm5,0x4e
+	vpternlogd	zmm6,zmm5,zmm0,0x96
+
+	vextracti32x4	xmm0,zmm6,1
+	vextracti32x4	xmm1,zmm6,2
+	vextracti32x4	xmm2,zmm6,3
+	vpxord	xmm10,xmm6,xmm0
+	vpternlogd	xmm10,xmm2,xmm1,0x96
+
+
+$L$done__func3:
+
+	vpshufb	xmm10,xmm10,xmm8
+	vmovdqu	XMMWORD[r12],xmm10
+
+	vzeroupper
+	movdqa	xmm6,XMMWORD[rsp]
+	movdqa	xmm7,XMMWORD[16+rsp]
+	movdqa	xmm8,XMMWORD[32+rsp]
+	movdqa	xmm9,XMMWORD[48+rsp]
+	movdqa	xmm10,XMMWORD[64+rsp]
+	movdqa	xmm11,XMMWORD[80+rsp]
+	movdqa	xmm12,XMMWORD[96+rsp]
+	movdqa	xmm13,XMMWORD[112+rsp]
+	movdqa	xmm14,XMMWORD[128+rsp]
+	movdqa	xmm15,XMMWORD[144+rsp]
+	add	rsp,160
+	pop	r12
+	pop	rdi
+	pop	rsi
+	ret
+$L$SEH_end_aes_gcm_enc_update_vaes_avx10_512_17:
+
+
+global	aes_gcm_dec_update_vaes_avx10_512
+
+ALIGN	32
+aes_gcm_dec_update_vaes_avx10_512:
+
+$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1:
+_CET_ENDBR
+	push	rsi
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_2:
+	push	rdi
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_3:
+	push	r12
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_4:
+
+	mov	rsi,QWORD[64+rsp]
+	mov	rdi,QWORD[72+rsp]
+	mov	r12,QWORD[80+rsp]
+	sub	rsp,160
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_5:
+	movdqa	XMMWORD[rsp],xmm6
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_6:
+	movdqa	XMMWORD[16+rsp],xmm7
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_7:
+	movdqa	XMMWORD[32+rsp],xmm8
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_8:
+	movdqa	XMMWORD[48+rsp],xmm9
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_9:
+	movdqa	XMMWORD[64+rsp],xmm10
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_10:
+	movdqa	XMMWORD[80+rsp],xmm11
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_11:
+	movdqa	XMMWORD[96+rsp],xmm12
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_12:
+	movdqa	XMMWORD[112+rsp],xmm13
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_13:
+	movdqa	XMMWORD[128+rsp],xmm14
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_14:
+	movdqa	XMMWORD[144+rsp],xmm15
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_15:
+
+$L$SEH_endprologue_aes_gcm_dec_update_vaes_avx10_512_16:
+
+	vbroadcasti32x4	zmm8,ZMMWORD[$L$bswap_mask]
+	vbroadcasti32x4	zmm31,ZMMWORD[$L$gfpoly]
+
+
+
+	vmovdqu	xmm10,XMMWORD[r12]
+	vpshufb	xmm10,xmm10,xmm8
+	vbroadcasti32x4	zmm12,ZMMWORD[rsi]
+	vpshufb	zmm12,zmm12,zmm8
+
+
+
+	mov	r10d,DWORD[240+r9]
+	lea	r10d,[((-20))+r10*4]
+
+
+
+
+	lea	r11,[96+r10*4+r9]
+	vbroadcasti32x4	zmm13,ZMMWORD[r9]
+	vbroadcasti32x4	zmm14,ZMMWORD[r11]
+
+
+	vpaddd	zmm12,zmm12,ZMMWORD[$L$ctr_pattern]
+
+
+	vbroadcasti32x4	zmm11,ZMMWORD[$L$inc_4blocks]
+
+
+
+	cmp	r8,4*64-1
+	jbe	NEAR $L$crypt_loop_4x_done__func4
+
+
+	vmovdqu8	zmm27,ZMMWORD[((256-256))+rdi]
+	vmovdqu8	zmm28,ZMMWORD[((256-192))+rdi]
+	vmovdqu8	zmm29,ZMMWORD[((256-128))+rdi]
+	vmovdqu8	zmm30,ZMMWORD[((256-64))+rdi]
+	vbroadcasti32x4	zmm15,ZMMWORD[((-144))+r11]
+	vbroadcasti32x4	zmm16,ZMMWORD[((-128))+r11]
+	vbroadcasti32x4	zmm17,ZMMWORD[((-112))+r11]
+	vbroadcasti32x4	zmm18,ZMMWORD[((-96))+r11]
+	vbroadcasti32x4	zmm19,ZMMWORD[((-80))+r11]
+	vbroadcasti32x4	zmm20,ZMMWORD[((-64))+r11]
+	vbroadcasti32x4	zmm21,ZMMWORD[((-48))+r11]
+	vbroadcasti32x4	zmm22,ZMMWORD[((-32))+r11]
+	vbroadcasti32x4	zmm23,ZMMWORD[((-16))+r11]
+$L$crypt_loop_4x__func4:
+	vmovdqu8	zmm4,ZMMWORD[rcx]
+	vmovdqu8	zmm5,ZMMWORD[64+rcx]
+	vmovdqu8	zmm6,ZMMWORD[128+rcx]
+	vmovdqu8	zmm7,ZMMWORD[192+rcx]
+
+
+
+	vpshufb	zmm0,zmm12,zmm8
+	vpaddd	zmm12,zmm12,zmm11
+	vpshufb	zmm1,zmm12,zmm8
+	vpaddd	zmm12,zmm12,zmm11
+	vpshufb	zmm2,zmm12,zmm8
+	vpaddd	zmm12,zmm12,zmm11
+	vpshufb	zmm3,zmm12,zmm8
+	vpaddd	zmm12,zmm12,zmm11
+
+
+	vpxord	zmm0,zmm0,zmm13
+	vpxord	zmm1,zmm1,zmm13
+	vpxord	zmm2,zmm2,zmm13
+	vpxord	zmm3,zmm3,zmm13
+
+	cmp	r10d,24
+	jl	NEAR $L$aes128__func4
+	je	NEAR $L$aes192__func4
+
+	vbroadcasti32x4	zmm9,ZMMWORD[((-208))+r11]
+	vaesenc	zmm0,zmm0,zmm9
+	vaesenc	zmm1,zmm1,zmm9
+	vaesenc	zmm2,zmm2,zmm9
+	vaesenc	zmm3,zmm3,zmm9
+
+	vbroadcasti32x4	zmm9,ZMMWORD[((-192))+r11]
+	vaesenc	zmm0,zmm0,zmm9
+	vaesenc	zmm1,zmm1,zmm9
+	vaesenc	zmm2,zmm2,zmm9
+	vaesenc	zmm3,zmm3,zmm9
+
+$L$aes192__func4:
+	vbroadcasti32x4	zmm9,ZMMWORD[((-176))+r11]
+	vaesenc	zmm0,zmm0,zmm9
+	vaesenc	zmm1,zmm1,zmm9
+	vaesenc	zmm2,zmm2,zmm9
+	vaesenc	zmm3,zmm3,zmm9
+
+	vbroadcasti32x4	zmm9,ZMMWORD[((-160))+r11]
+	vaesenc	zmm0,zmm0,zmm9
+	vaesenc	zmm1,zmm1,zmm9
+	vaesenc	zmm2,zmm2,zmm9
+	vaesenc	zmm3,zmm3,zmm9
+
+$L$aes128__func4:
+	vpshufb	zmm4,zmm4,zmm8
+	vpxord	zmm4,zmm4,zmm10
+	vpshufb	zmm5,zmm5,zmm8
+	vpshufb	zmm6,zmm6,zmm8
+
+	vaesenc	zmm0,zmm0,zmm15
+	vaesenc	zmm1,zmm1,zmm15
+	vaesenc	zmm2,zmm2,zmm15
+	vaesenc	zmm3,zmm3,zmm15
+
+	vpshufb	zmm7,zmm7,zmm8
+	vpclmulqdq	zmm10,zmm4,zmm27,0x00
+	vpclmulqdq	zmm24,zmm5,zmm28,0x00
+	vpclmulqdq	zmm25,zmm6,zmm29,0x00
+
+	vaesenc	zmm0,zmm0,zmm16
+	vaesenc	zmm1,zmm1,zmm16
+	vaesenc	zmm2,zmm2,zmm16
+	vaesenc	zmm3,zmm3,zmm16
+
+	vpxord	zmm10,zmm10,zmm24
+	vpclmulqdq	zmm26,zmm7,zmm30,0x00
+	vpternlogd	zmm10,zmm25,zmm26,0x96
+	vpclmulqdq	zmm24,zmm4,zmm27,0x01
+
+	vaesenc	zmm0,zmm0,zmm17
+	vaesenc	zmm1,zmm1,zmm17
+	vaesenc	zmm2,zmm2,zmm17
+	vaesenc	zmm3,zmm3,zmm17
+
+	vpclmulqdq	zmm25,zmm5,zmm28,0x01
+	vpclmulqdq	zmm26,zmm6,zmm29,0x01
+	vpternlogd	zmm24,zmm25,zmm26,0x96
+	vpclmulqdq	zmm25,zmm7,zmm30,0x01
+
+	vaesenc	zmm0,zmm0,zmm18
+	vaesenc	zmm1,zmm1,zmm18
+	vaesenc	zmm2,zmm2,zmm18
+	vaesenc	zmm3,zmm3,zmm18
+
+	vpclmulqdq	zmm26,zmm4,zmm27,0x10
+	vpternlogd	zmm24,zmm25,zmm26,0x96
+	vpclmulqdq	zmm25,zmm5,zmm28,0x10
+	vpclmulqdq	zmm26,zmm6,zmm29,0x10
+
+	vaesenc	zmm0,zmm0,zmm19
+	vaesenc	zmm1,zmm1,zmm19
+	vaesenc	zmm2,zmm2,zmm19
+	vaesenc	zmm3,zmm3,zmm19
+
+	vpternlogd	zmm24,zmm25,zmm26,0x96
+	vpclmulqdq	zmm26,zmm31,zmm10,0x01
+	vpclmulqdq	zmm25,zmm7,zmm30,0x10
+	vpxord	zmm24,zmm24,zmm25
+
+	vaesenc	zmm0,zmm0,zmm20
+	vaesenc	zmm1,zmm1,zmm20
+	vaesenc	zmm2,zmm2,zmm20
+	vaesenc	zmm3,zmm3,zmm20
+
+	vpshufd	zmm10,zmm10,0x4e
+	vpclmulqdq	zmm4,zmm4,zmm27,0x11
+	vpclmulqdq	zmm5,zmm5,zmm28,0x11
+	vpclmulqdq	zmm6,zmm6,zmm29,0x11
+
+	vaesenc	zmm0,zmm0,zmm21
+	vaesenc	zmm1,zmm1,zmm21
+	vaesenc	zmm2,zmm2,zmm21
+	vaesenc	zmm3,zmm3,zmm21
+
+	vpternlogd	zmm24,zmm10,zmm26,0x96
+	vpclmulqdq	zmm7,zmm7,zmm30,0x11
+	vpternlogd	zmm4,zmm5,zmm6,0x96
+	vpclmulqdq	zmm25,zmm31,zmm24,0x01
+
+	vaesenc	zmm0,zmm0,zmm22
+	vaesenc	zmm1,zmm1,zmm22
+	vaesenc	zmm2,zmm2,zmm22
+	vaesenc	zmm3,zmm3,zmm22
+
+	vpxord	zmm10,zmm4,zmm7
+	vpshufd	zmm24,zmm24,0x4e
+	vpternlogd	zmm10,zmm24,zmm25,0x96
+
+	vaesenc	zmm0,zmm0,zmm23
+	vaesenc	zmm1,zmm1,zmm23
+	vaesenc	zmm2,zmm2,zmm23
+	vaesenc	zmm3,zmm3,zmm23
+
+	vextracti32x4	xmm4,zmm10,1
+	vextracti32x4	xmm5,zmm10,2
+	vextracti32x4	xmm6,zmm10,3
+	vpxord	xmm10,xmm10,xmm4
+	vpternlogd	xmm10,xmm6,xmm5,0x96
+
+
+
+
+	vpxord	zmm4,zmm14,ZMMWORD[rcx]
+	vpxord	zmm5,zmm14,ZMMWORD[64+rcx]
+	vpxord	zmm6,zmm14,ZMMWORD[128+rcx]
+	vpxord	zmm7,zmm14,ZMMWORD[192+rcx]
+
+
+
+	vaesenclast	zmm4,zmm0,zmm4
+	vaesenclast	zmm5,zmm1,zmm5
+	vaesenclast	zmm6,zmm2,zmm6
+	vaesenclast	zmm7,zmm3,zmm7
+
+
+	vmovdqu8	ZMMWORD[rdx],zmm4
+	vmovdqu8	ZMMWORD[64+rdx],zmm5
+	vmovdqu8	ZMMWORD[128+rdx],zmm6
+	vmovdqu8	ZMMWORD[192+rdx],zmm7
+
+	sub	rcx,-4*64
+	sub	rdx,-4*64
+	add	r8,-4*64
+	cmp	r8,4*64-1
+	ja	NEAR $L$crypt_loop_4x__func4
+$L$crypt_loop_4x_done__func4:
+
+	test	r8,r8
+	jz	NEAR $L$done__func4
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	mov	rax,r8
+	neg	rax
+	and	rax,-16
+	lea	rsi,[256+rax*1+rdi]
+	vpxor	xmm4,xmm4,xmm4
+	vpxor	xmm5,xmm5,xmm5
+	vpxor	xmm6,xmm6,xmm6
+
+	cmp	r8,64
+	jb	NEAR $L$partial_vec__func4
+
+$L$crypt_loop_1x__func4:
+
+
+
+	vpshufb	zmm0,zmm12,zmm8
+	vpaddd	zmm12,zmm12,zmm11
+	vpxord	zmm0,zmm0,zmm13
+	lea	rax,[16+r9]
+$L$vaesenc_loop_tail_full_vec__func4:
+	vbroadcasti32x4	zmm9,ZMMWORD[rax]
+	vaesenc	zmm0,zmm0,zmm9
+	add	rax,16
+	cmp	r11,rax
+	jne	NEAR $L$vaesenc_loop_tail_full_vec__func4
+	vaesenclast	zmm0,zmm0,zmm14
+
+
+	vmovdqu8	zmm1,ZMMWORD[rcx]
+	vpxord	zmm0,zmm0,zmm1
+	vmovdqu8	ZMMWORD[rdx],zmm0
+
+
+	vmovdqu8	zmm30,ZMMWORD[rsi]
+	vpshufb	zmm0,zmm1,zmm8
+	vpxord	zmm0,zmm0,zmm10
+	vpclmulqdq	zmm7,zmm0,zmm30,0x00
+	vpclmulqdq	zmm1,zmm0,zmm30,0x01
+	vpclmulqdq	zmm2,zmm0,zmm30,0x10
+	vpclmulqdq	zmm3,zmm0,zmm30,0x11
+	vpxord	zmm4,zmm4,zmm7
+	vpternlogd	zmm5,zmm1,zmm2,0x96
+	vpxord	zmm6,zmm6,zmm3
+
+	vpxor	xmm10,xmm10,xmm10
+
+	add	rsi,64
+	add	rcx,64
+	add	rdx,64
+	sub	r8,64
+	cmp	r8,64
+	jae	NEAR $L$crypt_loop_1x__func4
+
+	test	r8,r8
+	jz	NEAR $L$reduce__func4
+
+$L$partial_vec__func4:
+
+
+
+
+	mov	rax,-1
+	bzhi	rax,rax,r8
+	kmovq	k1,rax
+	add	r8,15
+	and	r8,-16
+	mov	rax,-1
+	bzhi	rax,rax,r8
+	kmovq	k2,rax
+
+
+
+	vpshufb	zmm0,zmm12,zmm8
+	vpxord	zmm0,zmm0,zmm13
+	lea	rax,[16+r9]
+$L$vaesenc_loop_tail_partialvec__func4:
+	vbroadcasti32x4	zmm9,ZMMWORD[rax]
+	vaesenc	zmm0,zmm0,zmm9
+	add	rax,16
+	cmp	r11,rax
+	jne	NEAR $L$vaesenc_loop_tail_partialvec__func4
+	vaesenclast	zmm0,zmm0,zmm14
+
+
+	vmovdqu8	zmm1{k1}{z},[rcx]
+	vpxord	zmm0,zmm0,zmm1
+	vmovdqu8	ZMMWORD[rdx]{k1},zmm0
+
+
+
+
+
+
+
+
+
+
+
+
+
+	vmovdqu8	zmm30{k2}{z},[rsi]
+
+	vpshufb	zmm0,zmm1,zmm8
+	vpxord	zmm0,zmm0,zmm10
+	vpclmulqdq	zmm7,zmm0,zmm30,0x00
+	vpclmulqdq	zmm1,zmm0,zmm30,0x01
+	vpclmulqdq	zmm2,zmm0,zmm30,0x10
+	vpclmulqdq	zmm3,zmm0,zmm30,0x11
+	vpxord	zmm4,zmm4,zmm7
+	vpternlogd	zmm5,zmm1,zmm2,0x96
+	vpxord	zmm6,zmm6,zmm3
+
+
+$L$reduce__func4:
+
+	vpclmulqdq	zmm0,zmm31,zmm4,0x01
+	vpshufd	zmm4,zmm4,0x4e
+	vpternlogd	zmm5,zmm4,zmm0,0x96
+	vpclmulqdq	zmm0,zmm31,zmm5,0x01
+	vpshufd	zmm5,zmm5,0x4e
+	vpternlogd	zmm6,zmm5,zmm0,0x96
+
+	vextracti32x4	xmm0,zmm6,1
+	vextracti32x4	xmm1,zmm6,2
+	vextracti32x4	xmm2,zmm6,3
+	vpxord	xmm10,xmm6,xmm0
+	vpternlogd	xmm10,xmm2,xmm1,0x96
+
+
+$L$done__func4:
+
+	vpshufb	xmm10,xmm10,xmm8
+	vmovdqu	XMMWORD[r12],xmm10
+
+	vzeroupper
+	movdqa	xmm6,XMMWORD[rsp]
+	movdqa	xmm7,XMMWORD[16+rsp]
+	movdqa	xmm8,XMMWORD[32+rsp]
+	movdqa	xmm9,XMMWORD[48+rsp]
+	movdqa	xmm10,XMMWORD[64+rsp]
+	movdqa	xmm11,XMMWORD[80+rsp]
+	movdqa	xmm12,XMMWORD[96+rsp]
+	movdqa	xmm13,XMMWORD[112+rsp]
+	movdqa	xmm14,XMMWORD[128+rsp]
+	movdqa	xmm15,XMMWORD[144+rsp]
+	add	rsp,160
+	pop	r12
+	pop	rdi
+	pop	rsi
+	ret
+$L$SEH_end_aes_gcm_dec_update_vaes_avx10_512_17:
+
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_gcm_gmult_vpclmulqdq_avx10_1 wrt ..imagebase
+	DD	$L$SEH_end_gcm_gmult_vpclmulqdq_avx10_5 wrt ..imagebase
+	DD	$L$SEH_info_gcm_gmult_vpclmulqdq_avx10_0 wrt ..imagebase
+
+	DD	$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1 wrt ..imagebase
+	DD	$L$SEH_end_gcm_ghash_vpclmulqdq_avx10_256_12 wrt ..imagebase
+	DD	$L$SEH_info_gcm_ghash_vpclmulqdq_avx10_256_0 wrt ..imagebase
+
+	DD	$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1 wrt ..imagebase
+	DD	$L$SEH_end_aes_gcm_enc_update_vaes_avx10_256_17 wrt ..imagebase
+	DD	$L$SEH_info_aes_gcm_enc_update_vaes_avx10_256_0 wrt ..imagebase
+
+	DD	$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1 wrt ..imagebase
+	DD	$L$SEH_end_aes_gcm_dec_update_vaes_avx10_256_17 wrt ..imagebase
+	DD	$L$SEH_info_aes_gcm_dec_update_vaes_avx10_256_0 wrt ..imagebase
+
+	DD	$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1 wrt ..imagebase
+	DD	$L$SEH_end_gcm_ghash_vpclmulqdq_avx10_512_12 wrt ..imagebase
+	DD	$L$SEH_info_gcm_ghash_vpclmulqdq_avx10_512_0 wrt ..imagebase
+
+	DD	$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1 wrt ..imagebase
+	DD	$L$SEH_end_aes_gcm_enc_update_vaes_avx10_512_17 wrt ..imagebase
+	DD	$L$SEH_info_aes_gcm_enc_update_vaes_avx10_512_0 wrt ..imagebase
+
+	DD	$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1 wrt ..imagebase
+	DD	$L$SEH_end_aes_gcm_dec_update_vaes_avx10_512_17 wrt ..imagebase
+	DD	$L$SEH_info_aes_gcm_dec_update_vaes_avx10_512_0 wrt ..imagebase
+
+
+section	.xdata rdata align=8
+ALIGN	4
+$L$SEH_info_gcm_gmult_vpclmulqdq_avx10_0:
+	DB	1
+	DB	$L$SEH_endprologue_gcm_gmult_vpclmulqdq_avx10_4-$L$SEH_begin_gcm_gmult_vpclmulqdq_avx10_1
+	DB	3
+	DB	0
+	DB	$L$SEH_prologue_gcm_gmult_vpclmulqdq_avx10_3-$L$SEH_begin_gcm_gmult_vpclmulqdq_avx10_1
+	DB	104
+	DW	0
+	DB	$L$SEH_prologue_gcm_gmult_vpclmulqdq_avx10_2-$L$SEH_begin_gcm_gmult_vpclmulqdq_avx10_1
+	DB	34
+
+	DW	0
+$L$SEH_info_gcm_ghash_vpclmulqdq_avx10_256_0:
+	DB	1
+	DB	$L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx10_256_11-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
+	DB	18
+	DB	0
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_10-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
+	DB	216
+	DW	7
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_9-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
+	DB	200
+	DW	6
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_8-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
+	DB	184
+	DW	5
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_7-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
+	DB	168
+	DW	4
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_6-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
+	DB	152
+	DW	3
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_5-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
+	DB	136
+	DW	2
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_4-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
+	DB	120
+	DW	1
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_3-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
+	DB	104
+	DW	0
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_256_2-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_256_1
+	DB	1
+	DW	17
+
+$L$SEH_info_aes_gcm_enc_update_vaes_avx10_256_0:
+	DB	1
+	DB	$L$SEH_endprologue_aes_gcm_enc_update_vaes_avx10_256_16-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+	DB	25
+	DB	0
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_15-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+	DB	248
+	DW	9
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_14-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+	DB	232
+	DW	8
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_13-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+	DB	216
+	DW	7
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_12-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+	DB	200
+	DW	6
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_11-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+	DB	184
+	DW	5
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_10-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+	DB	168
+	DW	4
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_9-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+	DB	152
+	DW	3
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_8-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+	DB	136
+	DW	2
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_7-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+	DB	120
+	DW	1
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_6-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+	DB	104
+	DW	0
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_5-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+	DB	1
+	DW	20
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_4-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+	DB	192
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_3-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+	DB	112
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_256_2-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_256_1
+	DB	96
+
+	DW	0
+$L$SEH_info_aes_gcm_dec_update_vaes_avx10_256_0:
+	DB	1
+	DB	$L$SEH_endprologue_aes_gcm_dec_update_vaes_avx10_256_16-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+	DB	25
+	DB	0
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_15-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+	DB	248
+	DW	9
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_14-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+	DB	232
+	DW	8
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_13-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+	DB	216
+	DW	7
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_12-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+	DB	200
+	DW	6
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_11-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+	DB	184
+	DW	5
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_10-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+	DB	168
+	DW	4
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_9-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+	DB	152
+	DW	3
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_8-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+	DB	136
+	DW	2
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_7-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+	DB	120
+	DW	1
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_6-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+	DB	104
+	DW	0
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_5-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+	DB	1
+	DW	20
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_4-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+	DB	192
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_3-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+	DB	112
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_256_2-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_256_1
+	DB	96
+
+	DW	0
+$L$SEH_info_gcm_ghash_vpclmulqdq_avx10_512_0:
+	DB	1
+	DB	$L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx10_512_11-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1
+	DB	18
+	DB	0
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_10-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1
+	DB	216
+	DW	7
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_9-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1
+	DB	200
+	DW	6
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_8-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1
+	DB	184
+	DW	5
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_7-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1
+	DB	168
+	DW	4
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_6-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1
+	DB	152
+	DW	3
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_5-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1
+	DB	136
+	DW	2
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_4-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1
+	DB	120
+	DW	1
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_3-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1
+	DB	104
+	DW	0
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx10_512_2-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx10_512_1
+	DB	1
+	DW	17
+
+$L$SEH_info_aes_gcm_enc_update_vaes_avx10_512_0:
+	DB	1
+	DB	$L$SEH_endprologue_aes_gcm_enc_update_vaes_avx10_512_16-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+	DB	25
+	DB	0
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_15-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+	DB	248
+	DW	9
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_14-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+	DB	232
+	DW	8
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_13-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+	DB	216
+	DW	7
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_12-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+	DB	200
+	DW	6
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_11-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+	DB	184
+	DW	5
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_10-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+	DB	168
+	DW	4
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_9-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+	DB	152
+	DW	3
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_8-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+	DB	136
+	DW	2
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_7-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+	DB	120
+	DW	1
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_6-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+	DB	104
+	DW	0
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_5-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+	DB	1
+	DW	20
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_4-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+	DB	192
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_3-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+	DB	112
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx10_512_2-$L$SEH_begin_aes_gcm_enc_update_vaes_avx10_512_1
+	DB	96
+
+	DW	0
+$L$SEH_info_aes_gcm_dec_update_vaes_avx10_512_0:
+	DB	1
+	DB	$L$SEH_endprologue_aes_gcm_dec_update_vaes_avx10_512_16-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+	DB	25
+	DB	0
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_15-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+	DB	248
+	DW	9
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_14-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+	DB	232
+	DW	8
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_13-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+	DB	216
+	DW	7
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_12-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+	DB	200
+	DW	6
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_11-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+	DB	184
+	DW	5
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_10-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+	DB	168
+	DW	4
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_9-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+	DB	152
+	DW	3
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_8-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+	DB	136
+	DW	2
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_7-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+	DB	120
+	DW	1
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_6-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+	DB	104
+	DW	0
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_5-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+	DB	1
+	DW	20
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_4-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+	DB	192
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_3-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+	DB	112
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx10_512_2-$L$SEH_begin_aes_gcm_dec_update_vaes_avx10_512_1
+	DB	96
+
+	DW	0
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/sources.bzl b/gen/sources.bzl
index b258d42..9fa3e11 100644
--- a/gen/sources.bzl
+++ b/gen/sources.bzl
@@ -94,6 +94,8 @@
 ]
 
 bcm_sources_asm = [
+    "gen/bcm/aes-gcm-avx10-x86_64-apple.S",
+    "gen/bcm/aes-gcm-avx10-x86_64-linux.S",
     "gen/bcm/aesni-gcm-x86_64-apple.S",
     "gen/bcm/aesni-gcm-x86_64-linux.S",
     "gen/bcm/aesni-x86-apple.S",
@@ -192,6 +194,7 @@
 ]
 
 bcm_sources_nasm = [
+    "gen/bcm/aes-gcm-avx10-x86_64-win.asm",
     "gen/bcm/aesni-gcm-x86_64-win.asm",
     "gen/bcm/aesni-x86-win.asm",
     "gen/bcm/aesni-x86_64-win.asm",
diff --git a/gen/sources.cmake b/gen/sources.cmake
index 6d1c32f..3d79734 100644
--- a/gen/sources.cmake
+++ b/gen/sources.cmake
@@ -100,6 +100,8 @@
 set(
   BCM_SOURCES_ASM
 
+  gen/bcm/aes-gcm-avx10-x86_64-apple.S
+  gen/bcm/aes-gcm-avx10-x86_64-linux.S
   gen/bcm/aesni-gcm-x86_64-apple.S
   gen/bcm/aesni-gcm-x86_64-linux.S
   gen/bcm/aesni-x86-apple.S
@@ -200,6 +202,7 @@
 set(
   BCM_SOURCES_NASM
 
+  gen/bcm/aes-gcm-avx10-x86_64-win.asm
   gen/bcm/aesni-gcm-x86_64-win.asm
   gen/bcm/aesni-x86-win.asm
   gen/bcm/aesni-x86_64-win.asm
diff --git a/gen/sources.gni b/gen/sources.gni
index 3b72a79..0abc62b 100644
--- a/gen/sources.gni
+++ b/gen/sources.gni
@@ -94,6 +94,8 @@
 ]
 
 bcm_sources_asm = [
+  "gen/bcm/aes-gcm-avx10-x86_64-apple.S",
+  "gen/bcm/aes-gcm-avx10-x86_64-linux.S",
   "gen/bcm/aesni-gcm-x86_64-apple.S",
   "gen/bcm/aesni-gcm-x86_64-linux.S",
   "gen/bcm/aesni-x86-apple.S",
@@ -192,6 +194,7 @@
 ]
 
 bcm_sources_nasm = [
+  "gen/bcm/aes-gcm-avx10-x86_64-win.asm",
   "gen/bcm/aesni-gcm-x86_64-win.asm",
   "gen/bcm/aesni-x86-win.asm",
   "gen/bcm/aesni-x86_64-win.asm",
diff --git a/gen/sources.json b/gen/sources.json
index ded253d..6afbc27 100644
--- a/gen/sources.json
+++ b/gen/sources.json
@@ -78,6 +78,8 @@
       "crypto/fipsmodule/tls/kdf.cc.inc"
     ],
     "asm": [
+      "gen/bcm/aes-gcm-avx10-x86_64-apple.S",
+      "gen/bcm/aes-gcm-avx10-x86_64-linux.S",
       "gen/bcm/aesni-gcm-x86_64-apple.S",
       "gen/bcm/aesni-gcm-x86_64-linux.S",
       "gen/bcm/aesni-x86-apple.S",
@@ -175,6 +177,7 @@
       "third_party/fiat/asm/fiat_p256_adx_sqr.S"
     ],
     "nasm": [
+      "gen/bcm/aes-gcm-avx10-x86_64-win.asm",
       "gen/bcm/aesni-gcm-x86_64-win.asm",
       "gen/bcm/aesni-x86-win.asm",
       "gen/bcm/aesni-x86_64-win.asm",