Add VAES + AVX2 optimized AES-GCM

Add a VAES-optimized AES-GCM implementation targeting AMD Zen 3
processors, using AVX2 instead of AVX512 / AVX10.  With AVX2, only 16
vector registers are available and some instructions are missing, which
is inconvenient and keeps the code from being easily shared with the
AVX512 / AVX10 version.  However, using VAES still gives a significant
performance improvement of about 80-85% on long messages.  The
following tables show the change in AES-256-GCM throughput in MB/s on a
Zen 3 "Milan" processor for various message lengths in bytes.

Encryption:

            | 16384 |  4096 |  4095 |  1420 |   512 |   500 |
    --------+-------+-------+-------+-------+-------+-------+
    Before  |  3955 |  3749 |  3597 |  3054 |  2411 |  2038 |
    After   |  7128 |  6631 |  5975 |  4788 |  3807 |  2676 |

            |   300 |   200 |    64 |    63 |    16 |
    --------+-------+-------+-------+-------+-------+
    Before  |  1757 |  1405 |   856 |   602 |   356 |
    After   |  1885 |  1430 |   940 |   593 |   381 |

Decryption:

            | 16384 |  4096 |  4095 |  1420 |   512 |   500 |
    --------+-------+-------+-------+-------+-------+-------+
    Before  |  3962 |  3774 |  3593 |  2978 |  2510 |  1998 |
    After   |  7378 |  6836 |  6282 |  4826 |  3868 |  2753 |

            |   300 |   200 |    64 |    63 |    16 |
    --------+-------+-------+-------+-------+-------+
    Before  |  1742 |  1428 |   856 |   535 |   383 |
    After   |  1940 |  1534 |   940 |   573 |   383 |
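
For example, at 16384 bytes the encryption throughput goes from 3955 to
7128 MB/s (a factor of about 1.80, i.e. roughly 80% faster), and the
decryption throughput goes from 3962 to 7378 MB/s (roughly 86% faster).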

Change-Id: I583dd6b48b81ab3c6df51bfe8729366cad500537
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/74368
Reviewed-by: David Benjamin <davidben@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/build.json b/build.json
index 4d3fb4e..71489c9 100644
--- a/build.json
+++ b/build.json
@@ -141,6 +141,7 @@
         "perlasm_x86_64": [
             {"src": "crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl"},
             {"src": "crypto/fipsmodule/modes/asm/aes-gcm-avx10-x86_64.pl"},
+            {"src": "crypto/fipsmodule/modes/asm/aes-gcm-avx2-x86_64.pl"},
             {"src": "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"},
             {"src": "crypto/fipsmodule/modes/asm/ghash-ssse3-x86_64.pl"},
             {"src": "crypto/fipsmodule/modes/asm/ghash-x86_64.pl"},
diff --git a/crypto/crypto.cc b/crypto/crypto.cc
index 912a993..ac0928f 100644
--- a/crypto/crypto.cc
+++ b/crypto/crypto.cc
@@ -54,7 +54,7 @@
 // archive, linking on OS X will fail to resolve common symbols. By
 // initialising it to zero, it becomes a "data symbol", which isn't so
 // affected.
-HIDDEN uint8_t BORINGSSL_function_hit[8] = {0};
+HIDDEN uint8_t BORINGSSL_function_hit[9] = {0};
 #endif
 
 #if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
diff --git a/crypto/fipsmodule/modes/asm/aes-gcm-avx2-x86_64.pl b/crypto/fipsmodule/modes/asm/aes-gcm-avx2-x86_64.pl
new file mode 100644
index 0000000..6ea956b
--- /dev/null
+++ b/crypto/fipsmodule/modes/asm/aes-gcm-avx2-x86_64.pl
@@ -0,0 +1,1027 @@
+#!/usr/bin/env perl
+# Copyright 2024 The BoringSSL Authors
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+#
+#------------------------------------------------------------------------------
+#
+# VAES and VPCLMULQDQ optimized AES-GCM for x86_64 (AVX2 version)
+#
+# This is similar to aes-gcm-avx10-x86_64.pl, but it uses AVX2 instead of AVX512
+# / AVX10.  This means it can only use 16 vector registers instead of 32, the
+# maximum vector length is 32 bytes, and some instructions such as vpternlogd
+# and masked loads/stores are unavailable.  However, it is able to run on CPUs
+# that have VAES without AVX512 / AVX10, namely AMD Zen 3 (including "Milan"
+# server processors) and some Intel client CPUs such as Alder Lake.
+#
+# This implementation also uses Karatsuba multiplication instead of schoolbook
+# multiplication for GHASH in its main loop.  This does not help much on Intel,
+# but it improves performance by ~5% on AMD Zen 3 which is the main target for
+# this implementation.  Other factors weighing slightly in favor of Karatsuba
+# multiplication in this implementation are the lower maximum vector length
+# (which means there is space left in the Htable array to cache the halves of
+# the key powers XOR'd together) and the unavailability of the vpternlogd
+# instruction (which helped schoolbook a bit more than Karatsuba).
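+#
+# As a rough sketch of the Karatsuba trick: writing a 128-bit operand as
+# a = aH:aL and b = bH:bL (64-bit halves, with XOR as the field addition),
+# schoolbook computes the middle term aH*bL + aL*bH using two VPCLMULQDQs,
+# whereas Karatsuba computes it as (aH + aL)*(bH + bL) + LO + HI using one
+# VPCLMULQDQ, reusing LO = aL*bL and HI = aH*bH which are needed anyway.  The
+# XOR'd key-power halves cached in Htable are exactly these (aH + aL) values.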
+
+use strict;
+
+my $flavour = shift;
+my $output  = shift;
+if ( $flavour =~ /\./ ) { $output = $flavour; undef $flavour; }
+
+my $win64;
+my @argregs;
+if ( $flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/ ) {
+    $win64   = 1;
+    @argregs = ( "%rcx", "%rdx", "%r8", "%r9" );
+}
+else {
+    $win64   = 0;
+    @argregs = ( "%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9" );
+}
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/;
+my $dir = $1;
+my $xlate;
+( $xlate = "${dir}x86_64-xlate.pl" and -f $xlate )
+  or ( $xlate = "${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate )
+  or die "can't locate x86_64-xlate.pl";
+
+open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT = *OUT;
+
+my $g_cur_func_name;
+my $g_cur_func_uses_seh;
+my @g_cur_func_saved_gpregs;
+my @g_cur_func_saved_xmmregs;
+
+sub _begin_func {
+    my ( $funcname, $uses_seh ) = @_;
+    $g_cur_func_name          = $funcname;
+    $g_cur_func_uses_seh      = $uses_seh;
+    @g_cur_func_saved_gpregs  = ();
+    @g_cur_func_saved_xmmregs = ();
+    return <<___;
+.globl $funcname
+.type $funcname,\@abi-omnipotent
+.align 32
+$funcname:
+    .cfi_startproc
+    @{[ $uses_seh ? ".seh_startproc" : "" ]}
+    _CET_ENDBR
+___
+}
+
+# Push a list of general purpose registers onto the stack.
+sub _save_gpregs {
+    my @gpregs = @_;
+    my $code   = "";
+    die "_save_gpregs requires uses_seh" unless $g_cur_func_uses_seh;
+    die "_save_gpregs can only be called once per function"
+      if @g_cur_func_saved_gpregs;
+    die "Order must be _save_gpregs, then _save_xmmregs"
+      if @g_cur_func_saved_xmmregs;
+    @g_cur_func_saved_gpregs = @gpregs;
+    for my $reg (@gpregs) {
+        $code .= "push $reg\n";
+        if ($win64) {
+            $code .= ".seh_pushreg $reg\n";
+        }
+        else {
+            $code .= ".cfi_push $reg\n";
+        }
+    }
+    return $code;
+}
+
+# Push a list of xmm registers onto the stack if the target is Windows.
+sub _save_xmmregs {
+    my @xmmregs     = @_;
+    my $num_xmmregs = scalar @xmmregs;
+    my $code        = "";
+    die "_save_xmmregs requires uses_seh" unless $g_cur_func_uses_seh;
+    die "_save_xmmregs can only be called once per function"
+      if @g_cur_func_saved_xmmregs;
+    if ( $win64 and $num_xmmregs > 0 ) {
+        @g_cur_func_saved_xmmregs = @xmmregs;
+        my $is_misaligned = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0;
+        my $alloc_size    = 16 * $num_xmmregs + ( $is_misaligned ? 8 : 0 );
+        $code .= "sub \$$alloc_size, %rsp\n";
+        $code .= ".seh_stackalloc $alloc_size\n";
+        for my $i ( 0 .. $num_xmmregs - 1 ) {
+            my $reg_num = $xmmregs[$i];
+            my $pos     = 16 * $i;
+            $code .= "movdqa %xmm$reg_num, $pos(%rsp)\n";
+            $code .= ".seh_savexmm %xmm$reg_num, $pos\n";
+        }
+    }
+    return $code;
+}
+
+sub _end_func {
+    my $code = "";
+
+    # Restore any xmm registers that were saved earlier.
+    my $num_xmmregs = scalar @g_cur_func_saved_xmmregs;
+    if ( $win64 and $num_xmmregs > 0 ) {
+        my $need_alignment = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0;
+        my $alloc_size     = 16 * $num_xmmregs + ( $need_alignment ? 8 : 0 );
+        for my $i ( 0 .. $num_xmmregs - 1 ) {
+            my $reg_num = $g_cur_func_saved_xmmregs[$i];
+            my $pos     = 16 * $i;
+            $code .= "movdqa $pos(%rsp), %xmm$reg_num\n";
+        }
+        $code .= "add \$$alloc_size, %rsp\n";
+    }
+
+    # Restore any general purpose registers that were saved earlier.
+    for my $reg ( reverse @g_cur_func_saved_gpregs ) {
+        $code .= "pop $reg\n";
+        if ( !$win64 ) {
+            $code .= ".cfi_pop $reg\n";
+        }
+    }
+
+    $code .= <<___;
+    ret
+    @{[ $g_cur_func_uses_seh ? ".seh_endproc" : "" ]}
+    .cfi_endproc
+    .size   $g_cur_func_name, . - $g_cur_func_name
+___
+    return $code;
+}
+
+my $code = <<___;
+.section .rodata
+.align 16
+
+    # A shuffle mask that reflects the bytes of 16-byte blocks
+.Lbswap_mask:
+    .quad   0x08090a0b0c0d0e0f, 0x0001020304050607
+
+    # This is the GHASH reducing polynomial without its constant term, i.e.
+    # x^128 + x^7 + x^2 + x, represented using the backwards mapping
+    # between bits and polynomial coefficients.
+    #
+    # Alternatively, it can be interpreted as the naturally-ordered
+    # representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the
+    # "reversed" GHASH reducing polynomial without its x^128 term.
+.Lgfpoly:
+    .quad   1, 0xc200000000000000
+
+    # Same as above, but with the (1 << 64) bit set.
+.Lgfpoly_and_internal_carrybit:
+    .quad   1, 0xc200000000000001
+
+.align 32
+    # The below constants are used for incrementing the counter blocks.
+.Lctr_pattern:
+    .quad   0, 0
+    .quad   1, 0
+.Linc_2blocks:
+    .quad   2, 0
+    .quad   2, 0
+
+.text
+___
+
+# We use Htable[0..7] to store H^8 through H^1, and Htable[8..11] to store the
+# 64-bit halves of the key powers XOR'd together (for Karatsuba multiplication)
+# in the order 8,6,7,5,4,2,3,1.  We do not use Htable[12..15].
+my $NUM_H_POWERS            = 8;
+my $OFFSETOFEND_H_POWERS    = $NUM_H_POWERS * 16;
+my $OFFSETOF_H_POWERS_XORED = $OFFSETOFEND_H_POWERS;
+
+# Offset to 'rounds' in AES_KEY struct
+my $OFFSETOF_AES_ROUNDS = 240;
+
+# GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
+# the reduced products in \dst.  Uses schoolbook multiplication.
+sub _ghash_mul {
+    my ( $a, $b, $dst, $gfpoly, $t0, $t1, $t2 ) = @_;
+    return <<___;
+    vpclmulqdq      \$0x00, $a, $b, $t0        # LO = a_L * b_L
+    vpclmulqdq      \$0x01, $a, $b, $t1        # MI_0 = a_L * b_H
+    vpclmulqdq      \$0x10, $a, $b, $t2        # MI_1 = a_H * b_L
+    vpxor           $t2, $t1, $t1              # MI = MI_0 + MI_1
+    vpclmulqdq      \$0x01, $t0, $gfpoly, $t2  # LO_L*(x^63 + x^62 + x^57)
+    vpshufd         \$0x4e, $t0, $t0           # Swap halves of LO
+    vpxor           $t0, $t1, $t1              # Fold LO into MI (part 1)
+    vpxor           $t2, $t1, $t1              # Fold LO into MI (part 2)
+    vpclmulqdq      \$0x11, $a, $b, $dst       # HI = a_H * b_H
+    vpclmulqdq      \$0x01, $t1, $gfpoly, $t0  # MI_L*(x^63 + x^62 + x^57)
+    vpshufd         \$0x4e, $t1, $t1           # Swap halves of MI
+    vpxor           $t1, $dst, $dst            # Fold MI into HI (part 1)
+    vpxor           $t0, $dst, $dst            # Fold MI into HI (part 2)
+___
+}
+
+# void gcm_init_vpclmulqdq_avx2(u128 Htable[16], const uint64_t H[2]);
+#
+# Initialize |Htable| with powers of the GHASH subkey |H|.
+#
+# We use Htable[0..7] to store H^8 through H^1, and Htable[8..11] to store the
+# 64-bit halves of the key powers XOR'd together (for Karatsuba multiplication)
+# in the order 8,6,7,5,4,2,3,1.  We do not use Htable[12..15].
+$code .= _begin_func "gcm_init_vpclmulqdq_avx2", 1;
+{
+    my ( $HTABLE, $H_PTR ) = @argregs[ 0 .. 1 ];
+    my ( $TMP0,   $TMP0_XMM )   = ( "%ymm0", "%xmm0" );
+    my ( $TMP1,   $TMP1_XMM )   = ( "%ymm1", "%xmm1" );
+    my ( $TMP2,   $TMP2_XMM )   = ( "%ymm2", "%xmm2" );
+    my ( $H_CUR,  $H_CUR_XMM )  = ( "%ymm3", "%xmm3" );
+    my ( $H_CUR2, $H_CUR2_XMM ) = ( "%ymm4", "%xmm4" );
+    my ( $H_INC,  $H_INC_XMM )  = ( "%ymm5", "%xmm5" );
+    my ( $GFPOLY, $GFPOLY_XMM ) = ( "%ymm6", "%xmm6" );
+
+    $code .= <<___;
+    @{[ _save_xmmregs (6) ]}
+    .seh_endprologue
+
+    # Load the byte-reflected hash subkey.  BoringSSL provides it in
+    # byte-reflected form except the two halves are in the wrong order.
+    vpshufd         \$0x4e, ($H_PTR), $H_CUR_XMM
+
+    # Finish preprocessing the byte-reflected hash subkey by multiplying it by
+    # x^-1 ("standard" interpretation of polynomial coefficients) or
+    # equivalently x^1 (natural interpretation).  This gets the key into a
+    # format that avoids having to bit-reflect the data blocks later.
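+    #
+    # Roughly, the next five instructions implement this as a 128-bit left
+    # shift by one: vpaddq doubles each 64-bit half, the vpshufd/vpsrad pair
+    # builds sign-extension masks from the bits shifted out of each half, and
+    # the vpand/vpxor use those masks to carry the low half's top bit into
+    # the high half and, when the overall top bit is shifted out, to fold in
+    # the reduction polynomial.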
+    vpshufd         \$0xd3, $H_CUR_XMM, $TMP0_XMM
+    vpsrad          \$31, $TMP0_XMM, $TMP0_XMM
+    vpaddq          $H_CUR_XMM, $H_CUR_XMM, $H_CUR_XMM
+    vpand           .Lgfpoly_and_internal_carrybit(%rip), $TMP0_XMM, $TMP0_XMM
+    vpxor           $TMP0_XMM, $H_CUR_XMM, $H_CUR_XMM
+
+    vbroadcasti128  .Lgfpoly(%rip), $GFPOLY
+
+    # Square H^1 to get H^2.
+    @{[ _ghash_mul  $H_CUR_XMM, $H_CUR_XMM, $H_INC_XMM, $GFPOLY_XMM,
+                    $TMP0_XMM, $TMP1_XMM, $TMP2_XMM ]}
+
+    # Create H_CUR = [H^2, H^1] and H_INC = [H^2, H^2].
+    vinserti128     \$1, $H_CUR_XMM, $H_INC, $H_CUR
+    vinserti128     \$1, $H_INC_XMM, $H_INC, $H_INC
+
+    # Compute H_CUR2 = [H^4, H^3].
+    @{[ _ghash_mul  $H_INC, $H_CUR, $H_CUR2, $GFPOLY, $TMP0, $TMP1, $TMP2 ]}
+
+    # Store [H^2, H^1] and [H^4, H^3].
+    vmovdqu         $H_CUR, 3*32($HTABLE)
+    vmovdqu         $H_CUR2, 2*32($HTABLE)
+
+    # For Karatsuba multiplication: compute and store the two 64-bit halves of
+    # each key power XOR'd together.  Order is 4,2,3,1.
+    vpunpcklqdq     $H_CUR, $H_CUR2, $TMP0
+    vpunpckhqdq     $H_CUR, $H_CUR2, $TMP1
+    vpxor           $TMP1, $TMP0, $TMP0
+    vmovdqu         $TMP0, $OFFSETOF_H_POWERS_XORED+32($HTABLE)
+
+    # Compute and store H_CUR = [H^6, H^5] and H_CUR2 = [H^8, H^7].
+    @{[ _ghash_mul  $H_INC, $H_CUR2, $H_CUR, $GFPOLY, $TMP0, $TMP1, $TMP2 ]}
+    @{[ _ghash_mul  $H_INC, $H_CUR, $H_CUR2, $GFPOLY, $TMP0, $TMP1, $TMP2 ]}
+    vmovdqu         $H_CUR, 1*32($HTABLE)
+    vmovdqu         $H_CUR2, 0*32($HTABLE)
+
+    # Again, compute and store the two 64-bit halves of each key power XOR'd
+    # together.  Order is 8,6,7,5.
+    vpunpcklqdq     $H_CUR, $H_CUR2, $TMP0
+    vpunpckhqdq     $H_CUR, $H_CUR2, $TMP1
+    vpxor           $TMP1, $TMP0, $TMP0
+    vmovdqu         $TMP0, $OFFSETOF_H_POWERS_XORED($HTABLE)
+
+    vzeroupper
+___
+}
+$code .= _end_func;
+
+# Do one step of the GHASH update of four vectors of data blocks.
+#   $i: the step to do, 0 through 9
+#   $ghashdata_ptr: pointer to the data blocks (ciphertext or AAD)
+#   $htable: pointer to the Htable for the key
+#   $bswap_mask: mask for reflecting the bytes of blocks
+#   $h_pow[2-1]_xored: XOR'd key powers cached from Htable
+#   $tmp[0-2]: temporary registers.  $tmp[1-2] must be preserved across steps.
+#   $lo, $mi: working state for this macro that must be preserved across steps
+#   $ghash_acc: the GHASH accumulator (input/output)
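+#
+# The update is split into numbered steps so that _aes_gcm_update below can
+# interleave the GHASH steps with AES rounds in its main loop; _ghash_4x
+# simply runs all ten steps back to back.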
+sub _ghash_step_4x {
+    my (
+        $i,            $ghashdata_ptr, $htable, $bswap_mask,
+        $h_pow2_xored, $h_pow1_xored,  $tmp0,   $tmp0_xmm,
+        $tmp1,         $tmp2,          $lo,     $mi,
+        $ghash_acc,    $ghash_acc_xmm
+    ) = @_;
+    my ( $hi, $hi_xmm ) = ( $ghash_acc, $ghash_acc_xmm );    # alias
+    if ( $i == 0 ) {
+        return <<___;
+        # First vector
+        vmovdqu         0*32($ghashdata_ptr), $tmp1
+        vpshufb         $bswap_mask, $tmp1, $tmp1
+        vmovdqu         0*32($htable), $tmp2
+        vpxor           $ghash_acc, $tmp1, $tmp1
+        vpclmulqdq      \$0x00, $tmp2, $tmp1, $lo
+        vpclmulqdq      \$0x11, $tmp2, $tmp1, $hi
+        vpunpckhqdq     $tmp1, $tmp1, $tmp0
+        vpxor           $tmp1, $tmp0, $tmp0
+        vpclmulqdq      \$0x00, $h_pow2_xored, $tmp0, $mi
+___
+    }
+    elsif ( $i == 1 ) {
+        return <<___;
+___
+    }
+    elsif ( $i == 2 ) {
+        return <<___;
+        # Second vector
+        vmovdqu         1*32($ghashdata_ptr), $tmp1
+        vpshufb         $bswap_mask, $tmp1, $tmp1
+        vmovdqu         1*32($htable), $tmp2
+        vpclmulqdq      \$0x00, $tmp2, $tmp1, $tmp0
+        vpxor           $tmp0, $lo, $lo
+        vpclmulqdq      \$0x11, $tmp2, $tmp1, $tmp0
+        vpxor           $tmp0, $hi, $hi
+        vpunpckhqdq     $tmp1, $tmp1, $tmp0
+        vpxor           $tmp1, $tmp0, $tmp0
+        vpclmulqdq      \$0x10, $h_pow2_xored, $tmp0, $tmp0
+        vpxor           $tmp0, $mi, $mi
+___
+    }
+    elsif ( $i == 3 ) {
+        return <<___;
+        # Third vector
+        vmovdqu         2*32($ghashdata_ptr), $tmp1
+        vpshufb         $bswap_mask, $tmp1, $tmp1
+        vmovdqu         2*32($htable), $tmp2
+___
+    }
+    elsif ( $i == 4 ) {
+        return <<___;
+        vpclmulqdq      \$0x00, $tmp2, $tmp1, $tmp0
+        vpxor           $tmp0, $lo, $lo
+        vpclmulqdq      \$0x11, $tmp2, $tmp1, $tmp0
+        vpxor           $tmp0, $hi, $hi
+___
+    }
+    elsif ( $i == 5 ) {
+        return <<___;
+        vpunpckhqdq     $tmp1, $tmp1, $tmp0
+        vpxor           $tmp1, $tmp0, $tmp0
+        vpclmulqdq      \$0x00, $h_pow1_xored, $tmp0, $tmp0
+        vpxor           $tmp0, $mi, $mi
+
+        # Fourth vector
+        vmovdqu         3*32($ghashdata_ptr), $tmp1
+        vpshufb         $bswap_mask, $tmp1, $tmp1
+___
+    }
+    elsif ( $i == 6 ) {
+        return <<___;
+        vmovdqu         3*32($htable), $tmp2
+        vpclmulqdq      \$0x00, $tmp2, $tmp1, $tmp0
+        vpxor           $tmp0, $lo, $lo
+        vpclmulqdq      \$0x11, $tmp2, $tmp1, $tmp0
+        vpxor           $tmp0, $hi, $hi
+        vpunpckhqdq     $tmp1, $tmp1, $tmp0
+        vpxor           $tmp1, $tmp0, $tmp0
+        vpclmulqdq      \$0x10, $h_pow1_xored, $tmp0, $tmp0
+        vpxor           $tmp0, $mi, $mi
+___
+    }
+    elsif ( $i == 7 ) {
+        return <<___;
+        # Finalize 'mi' following Karatsuba multiplication.
+        vpxor           $lo, $mi, $mi
+        vpxor           $hi, $mi, $mi
+
+        # Fold lo into mi.
+        vbroadcasti128  .Lgfpoly(%rip), $tmp2
+        vpclmulqdq      \$0x01, $lo, $tmp2, $tmp0
+        vpshufd         \$0x4e, $lo, $lo
+        vpxor           $lo, $mi, $mi
+        vpxor           $tmp0, $mi, $mi
+___
+    }
+    elsif ( $i == 8 ) {
+        return <<___;
+        # Fold mi into hi.
+        vpclmulqdq      \$0x01, $mi, $tmp2, $tmp0
+        vpshufd         \$0x4e, $mi, $mi
+        vpxor           $mi, $hi, $hi
+        vpxor           $tmp0, $hi, $hi
+___
+    }
+    elsif ( $i == 9 ) {
+        return <<___;
+        vextracti128    \$1, $hi, $tmp0_xmm
+        vpxor           $tmp0_xmm, $hi_xmm, $ghash_acc_xmm
+___
+    }
+}
+
+sub _ghash_4x {
+    my $code = "";
+    for my $i ( 0 .. 9 ) {
+        $code .= _ghash_step_4x $i, @_;
+    }
+    return $code;
+}
+
+# void gcm_gmult_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16]);
+$code .= _begin_func "gcm_gmult_vpclmulqdq_avx2", 1;
+{
+    my ( $GHASH_ACC_PTR, $HTABLE ) = @argregs[ 0 .. 1 ];
+    my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
+      map( "%xmm$_", ( 0 .. 6 ) );
+
+    $code .= <<___;
+    @{[ _save_xmmregs (6) ]}
+    .seh_endprologue
+
+    vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC
+    vmovdqu         .Lbswap_mask(%rip), $BSWAP_MASK
+    vmovdqu         $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1
+    vmovdqu         .Lgfpoly(%rip), $GFPOLY
+    vpshufb         $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
+
+    @{[ _ghash_mul  $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $T0, $T1, $T2 ]}
+
+    vpshufb         $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
+    vmovdqu         $GHASH_ACC, ($GHASH_ACC_PTR)
+___
+}
+$code .= _end_func;
+
+# void gcm_ghash_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16],
+#                                const uint8_t *in, size_t len);
+#
+# Using the key |Htable|, update the GHASH accumulator |Xi| with the data given
+# by |in| and |len|.  |len| must be a multiple of 16.
+#
+# This function handles large amounts of AAD efficiently, while also keeping the
+# overhead low for small amounts of AAD which is the common case.  TLS uses less
+# than one block of AAD, but (uncommonly) other use cases may use much more.
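+#
+# Roughly, the code below handles 128 bytes per iteration in its main loop,
+# then 32 bytes at a time, then one final 16-byte block if needed; inputs
+# shorter than 32 bytes branch straight to the single-block path.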
+$code .= _begin_func "gcm_ghash_vpclmulqdq_avx2", 1;
+{
+    # Function arguments
+    my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ];
+
+    # Additional local variables
+    my ( $TMP0,       $TMP0_XMM )       = ( "%ymm0", "%xmm0" );
+    my ( $TMP1,       $TMP1_XMM )       = ( "%ymm1", "%xmm1" );
+    my ( $TMP2,       $TMP2_XMM )       = ( "%ymm2", "%xmm2" );
+    my ( $LO,         $LO_XMM )         = ( "%ymm3", "%xmm3" );
+    my ( $MI,         $MI_XMM )         = ( "%ymm4", "%xmm4" );
+    my ( $GHASH_ACC,  $GHASH_ACC_XMM )  = ( "%ymm5", "%xmm5" );
+    my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%ymm6", "%xmm6" );
+    my ( $GFPOLY,     $GFPOLY_XMM )     = ( "%ymm7", "%xmm7" );
+    my $H_POW2_XORED = "%ymm8";
+    my $H_POW1_XORED = "%ymm9";
+
+    $code .= <<___;
+    @{[ _save_xmmregs (6 .. 9) ]}
+    .seh_endprologue
+
+    vbroadcasti128  .Lbswap_mask(%rip), $BSWAP_MASK
+    vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC_XMM
+    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+    vbroadcasti128  .Lgfpoly(%rip), $GFPOLY
+
+    # Optimize for AADLEN < 32 by checking for AADLEN < 32 before AADLEN < 128.
+    cmp             \$32, $AADLEN
+    jb              .Lghash_lastblock
+
+    cmp             \$127, $AADLEN
+    jbe             .Lghash_loop_1x
+
+    # Update GHASH with 128 bytes of AAD at a time.
+    vmovdqu         $OFFSETOF_H_POWERS_XORED($HTABLE), $H_POW2_XORED
+    vmovdqu         $OFFSETOF_H_POWERS_XORED+32($HTABLE), $H_POW1_XORED
+.Lghash_loop_4x:
+    @{[ _ghash_4x   $AAD, $HTABLE, $BSWAP_MASK, $H_POW2_XORED, $H_POW1_XORED,
+                    $TMP0, $TMP0_XMM, $TMP1, $TMP2, $LO, $MI, $GHASH_ACC,
+                    $GHASH_ACC_XMM ]}
+    sub             \$-128, $AAD  # 128 is 4 bytes, -128 is 1 byte
+    add             \$-128, $AADLEN
+    cmp             \$127, $AADLEN
+    ja              .Lghash_loop_4x
+
+    # Update GHASH with 32 bytes of AAD at a time.
+    cmp             \$32, $AADLEN
+    jb              .Lghash_loop_1x_done
+.Lghash_loop_1x:
+    vmovdqu         ($AAD), $TMP0
+    vpshufb         $BSWAP_MASK, $TMP0, $TMP0
+    vpxor           $TMP0, $GHASH_ACC, $GHASH_ACC
+    vmovdqu         $OFFSETOFEND_H_POWERS-32($HTABLE), $TMP0
+    @{[ _ghash_mul  $TMP0, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $TMP1, $TMP2, $LO ]}
+    vextracti128    \$1, $GHASH_ACC, $TMP0_XMM
+    vpxor           $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+    add             \$32, $AAD
+    sub             \$32, $AADLEN
+    cmp             \$32, $AADLEN
+    jae             .Lghash_loop_1x
+.Lghash_loop_1x_done:
+    # Issue the vzeroupper that is needed after using ymm registers.  Do it here
+    # instead of at the end, to minimize overhead for small AADLEN.
+    vzeroupper
+
+    # Update GHASH with the remaining 16-byte block if any.
+.Lghash_lastblock:
+    test            $AADLEN, $AADLEN
+    jz              .Lghash_done
+    vmovdqu         ($AAD), $TMP0_XMM
+    vpshufb         $BSWAP_MASK_XMM, $TMP0_XMM, $TMP0_XMM
+    vpxor           $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+    vmovdqu         $OFFSETOFEND_H_POWERS-16($HTABLE), $TMP0_XMM
+    @{[ _ghash_mul  $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM,
+                    $TMP1_XMM, $TMP2_XMM, $LO_XMM ]}
+
+.Lghash_done:
+    # Store the updated GHASH accumulator back to memory.
+    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+    vmovdqu         $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
+___
+}
+$code .= _end_func;
+
+sub _vaesenc_4x {
+    my ( $round_key, $aesdata0, $aesdata1, $aesdata2, $aesdata3 ) = @_;
+    return <<___;
+    vaesenc         $round_key, $aesdata0, $aesdata0
+    vaesenc         $round_key, $aesdata1, $aesdata1
+    vaesenc         $round_key, $aesdata2, $aesdata2
+    vaesenc         $round_key, $aesdata3, $aesdata3
+___
+}
+
+sub _ctr_begin_4x {
+    my (
+        $le_ctr,   $bswap_mask, $rndkey0,  $aesdata0,
+        $aesdata1, $aesdata2,   $aesdata3, $tmp
+    ) = @_;
+    return <<___;
+    # Increment le_ctr four times to generate four vectors of little-endian
+    # counter blocks, swap each to big-endian, and store them in aesdata[0-3].
+    vmovdqu         .Linc_2blocks(%rip), $tmp
+    vpshufb         $bswap_mask, $le_ctr, $aesdata0
+    vpaddd          $tmp, $le_ctr, $le_ctr
+    vpshufb         $bswap_mask, $le_ctr, $aesdata1
+    vpaddd          $tmp, $le_ctr, $le_ctr
+    vpshufb         $bswap_mask, $le_ctr, $aesdata2
+    vpaddd          $tmp, $le_ctr, $le_ctr
+    vpshufb         $bswap_mask, $le_ctr, $aesdata3
+    vpaddd          $tmp, $le_ctr, $le_ctr
+
+    # AES "round zero": XOR in the zero-th round key.
+    vpxor           $rndkey0, $aesdata0, $aesdata0
+    vpxor           $rndkey0, $aesdata1, $aesdata1
+    vpxor           $rndkey0, $aesdata2, $aesdata2
+    vpxor           $rndkey0, $aesdata3, $aesdata3
+___
+}
+
+# Do the last AES round for four vectors of counter blocks, XOR four vectors of
+# source data with the resulting keystream blocks, and write the result to the
+# destination buffer.  The implementation differs slightly as it takes advantage
+# of the property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a) to reduce
+# latency, but it has the same effect.
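+#
+# (The property holds because the last AES round ends with a plain XOR of the
+# round key, so XOR'ing the source data into the round key ahead of time gives
+# the same result; this lets the data loads and XORs run off the critical path
+# of the AES round chain.)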
+sub _aesenclast_and_xor_4x {
+    my (
+        $src,      $dst,      $rndkeylast, $aesdata0,
+        $aesdata1, $aesdata2, $aesdata3,   $t0,
+        $t1,       $t2,       $t3
+    ) = @_;
+    return <<___;
+    vpxor           0*32($src), $rndkeylast, $t0
+    vpxor           1*32($src), $rndkeylast, $t1
+    vpxor           2*32($src), $rndkeylast, $t2
+    vpxor           3*32($src), $rndkeylast, $t3
+    vaesenclast     $t0, $aesdata0, $aesdata0
+    vaesenclast     $t1, $aesdata1, $aesdata1
+    vaesenclast     $t2, $aesdata2, $aesdata2
+    vaesenclast     $t3, $aesdata3, $aesdata3
+    vmovdqu         $aesdata0, 0*32($dst)
+    vmovdqu         $aesdata1, 1*32($dst)
+    vmovdqu         $aesdata2, 2*32($dst)
+    vmovdqu         $aesdata3, 3*32($dst)
+___
+}
+
+my $g_update_macro_expansion_count = 0;
+
+# void aes_gcm_{enc,dec}_update_vaes_avx2(const uint8_t *in, uint8_t *out,
+#                                         size_t len, const AES_KEY *key,
+#                                         const uint8_t ivec[16],
+#                                         const u128 Htable[16],
+#                                         uint8_t Xi[16]);
+#
+# This macro generates a GCM encryption or decryption update function with the
+# above prototype (with \enc selecting which one).  The function computes the
+# next portion of the CTR keystream, XOR's it with |len| bytes from |in|, and
+# writes the resulting encrypted or decrypted data to |out|.  It also updates
+# the GHASH accumulator |Xi| using the next |len| ciphertext bytes.
+#
+# |len| must be a multiple of 16.  The caller must do any buffering needed to
+# ensure this.  Both in-place and out-of-place en/decryption are supported.
+#
+# |ivec| must give the current counter in big-endian format.  This function
+# loads the counter from |ivec| and increments the loaded counter as needed, but
+# it does *not* store the updated counter back to |ivec|.  The caller must
+# update |ivec| if any more data segments follow.  Internally, only the low
+# 32-bit word of the counter is incremented, following the GCM standard.
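+#
+# (For reference, the gcm.cc.inc changes later in this change update |ivec|
+# after each call by adding len/16 to the big-endian 32-bit word in its last
+# four bytes.)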
+sub _aes_gcm_update {
+    my $local_label_suffix = "__func" . ++$g_update_macro_expansion_count;
+    my ($enc)              = @_;
+    my $code               = "";
+
+    # Function arguments
+    my ( $SRC, $DST, $DATALEN, $AESKEY, $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR )
+      = $win64
+      ? ( @argregs[ 0 .. 3 ], "%rsi", "%rdi", "%r12" )
+      : ( @argregs[ 0 .. 5 ], "%r12" );
+
+    # Additional local variables.
+    # %rax is used as a temporary register.  BE_CTR_PTR is also available as a
+    # temporary register after the counter is loaded.
+
+    # AES key length in bytes
+    my ( $AESKEYLEN, $AESKEYLEN64 ) = ( "%r10d", "%r10" );
+
+    # Pointer to the last AES round key for the chosen AES variant
+    my $RNDKEYLAST_PTR = "%r11";
+
+    # BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
+    # using vpshufb, copied to all 128-bit lanes.
+    my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%ymm0", "%xmm0" );
+
+    # GHASH_ACC is the accumulator variable for GHASH.  When fully reduced,
+    # only the lowest 128-bit lane can be nonzero.  When not fully reduced,
+    # more than one lane may be used, and they need to be XOR'd together.
+    my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%ymm1", "%xmm1" );
+
+    # TMP[0-2] are temporary registers.
+    my ( $TMP0, $TMP0_XMM ) = ( "%ymm2", "%xmm2" );
+    my ( $TMP1, $TMP1_XMM ) = ( "%ymm3", "%xmm3" );
+    my ( $TMP2, $TMP2_XMM ) = ( "%ymm4", "%xmm4" );
+
+    # LO and MI are used to accumulate unreduced GHASH products.
+    my ( $LO, $LO_XMM ) = ( "%ymm5", "%xmm5" );
+    my ( $MI, $MI_XMM ) = ( "%ymm6", "%xmm6" );
+
+    # Cached key powers from Htable
+    my ( $H_POW2_XORED, $H_POW2_XORED_XMM ) = ( "%ymm7", "%xmm7" );
+    my ( $H_POW1_XORED, $H_POW1_XORED_XMM ) = ( "%ymm8", "%xmm8" );
+
+    # RNDKEY0 caches the zero-th round key, and RNDKEYLAST the last one.
+    my $RNDKEY0    = "%ymm9";
+    my $RNDKEYLAST = "%ymm10";
+
+    # LE_CTR contains the next set of little-endian counter blocks.
+    my $LE_CTR = "%ymm11";
+
+    # AESDATA[0-3] hold the counter blocks that are being encrypted by AES.
+    my ( $AESDATA0, $AESDATA0_XMM ) = ( "%ymm12", "%xmm12" );
+    my ( $AESDATA1, $AESDATA1_XMM ) = ( "%ymm13", "%xmm13" );
+    my ( $AESDATA2, $AESDATA2_XMM ) = ( "%ymm14", "%xmm14" );
+    my ( $AESDATA3, $AESDATA3_XMM ) = ( "%ymm15", "%xmm15" );
+    my @AESDATA = ( $AESDATA0, $AESDATA1, $AESDATA2, $AESDATA3 );
+
+    my @ghash_4x_args = (
+        $enc ? $DST : $SRC, $HTABLE, $BSWAP_MASK, $H_POW2_XORED,
+        $H_POW1_XORED,      $TMP0,   $TMP0_XMM,   $TMP1,
+        $TMP2,              $LO,     $MI,         $GHASH_ACC,
+        $GHASH_ACC_XMM
+    );
+
+    if ($win64) {
+        $code .= <<___;
+        @{[ _save_gpregs $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR ]}
+        mov             64(%rsp), $BE_CTR_PTR     # arg5
+        mov             72(%rsp), $HTABLE         # arg6
+        mov             80(%rsp), $GHASH_ACC_PTR  # arg7
+        @{[ _save_xmmregs (6 .. 15) ]}
+        .seh_endprologue
+___
+    }
+    else {
+        $code .= <<___;
+        @{[ _save_gpregs $GHASH_ACC_PTR ]}
+        mov             16(%rsp), $GHASH_ACC_PTR  # arg7
+___
+    }
+
+    if ($enc) {
+        $code .= <<___;
+#ifdef BORINGSSL_DISPATCH_TEST
+        .extern BORINGSSL_function_hit
+        movb \$1,BORINGSSL_function_hit+8(%rip)
+#endif
+___
+    }
+    $code .= <<___;
+    vbroadcasti128  .Lbswap_mask(%rip), $BSWAP_MASK
+
+    # Load the GHASH accumulator and the starting counter.
+    # BoringSSL passes these values in big endian format.
+    vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC_XMM
+    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+    vbroadcasti128  ($BE_CTR_PTR), $LE_CTR
+    vpshufb         $BSWAP_MASK, $LE_CTR, $LE_CTR
+
+    # Load the AES key length in bytes.  BoringSSL stores number of rounds
+    # minus 1, so convert using: AESKEYLEN = 4 * aeskey->rounds - 20.
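+    # For example, a stored value of 9 (AES-128) gives 4*9 - 20 = 16 bytes,
+    # 11 (AES-192) gives 24, and 13 (AES-256) gives 32.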
+    movl            $OFFSETOF_AES_ROUNDS($AESKEY), $AESKEYLEN
+    lea             -20(,$AESKEYLEN,4), $AESKEYLEN
+
+    # Make RNDKEYLAST_PTR point to the last AES round key.  This is the
+    # round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256
+    # respectively.  Then load the zero-th and last round keys.
+    lea             6*16($AESKEY,$AESKEYLEN64,4), $RNDKEYLAST_PTR
+    vbroadcasti128  ($AESKEY), $RNDKEY0
+    vbroadcasti128  ($RNDKEYLAST_PTR), $RNDKEYLAST
+
+    # Finish initializing LE_CTR by adding 1 to the second block.
+    vpaddd          .Lctr_pattern(%rip), $LE_CTR, $LE_CTR
+
+    # If there are at least 128 bytes of data, then continue into the loop that
+    # processes 128 bytes of data at a time.  Otherwise skip it.
+    cmp             \$127, $DATALEN
+    jbe             .Lcrypt_loop_4x_done$local_label_suffix
+
+    vmovdqu         $OFFSETOF_H_POWERS_XORED($HTABLE), $H_POW2_XORED
+    vmovdqu         $OFFSETOF_H_POWERS_XORED+32($HTABLE), $H_POW1_XORED
+___
+
+    # Main loop: en/decrypt and hash 4 vectors (128 bytes) at a time.
+
+    if ($enc) {
+        $code .= <<___;
+        # Encrypt the first 4 vectors of plaintext blocks.
+        @{[ _ctr_begin_4x $LE_CTR, $BSWAP_MASK, $RNDKEY0, @AESDATA, $TMP0 ]}
+        lea             16($AESKEY), %rax
+.Lvaesenc_loop_first_4_vecs$local_label_suffix:
+        vbroadcasti128  (%rax), $TMP0
+        @{[ _vaesenc_4x $TMP0, @AESDATA ]}
+        add             \$16, %rax
+        cmp             %rax, $RNDKEYLAST_PTR
+        jne             .Lvaesenc_loop_first_4_vecs$local_label_suffix
+        @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA,
+                                   $TMP0, $TMP1, $LO, $MI ]}
+        sub             \$-128, $SRC  # 128 is 4 bytes, -128 is 1 byte
+        add             \$-128, $DATALEN
+        cmp             \$127, $DATALEN
+        jbe             .Lghash_last_ciphertext_4x$local_label_suffix
+___
+    }
+
+    $code .= <<___;
+.align 16
+.Lcrypt_loop_4x$local_label_suffix:
+
+    # Start the AES encryption of the counter blocks.
+    @{[ _ctr_begin_4x $LE_CTR, $BSWAP_MASK, $RNDKEY0, @AESDATA, $TMP0 ]}
+    cmp             \$24, $AESKEYLEN
+    jl              .Laes128$local_label_suffix
+    je              .Laes192$local_label_suffix
+    # AES-256
+    vbroadcasti128 -13*16($RNDKEYLAST_PTR), $TMP0
+    @{[ _vaesenc_4x $TMP0, @AESDATA ]}
+    vbroadcasti128 -12*16($RNDKEYLAST_PTR), $TMP0
+    @{[ _vaesenc_4x $TMP0, @AESDATA ]}
+.Laes192$local_label_suffix:
+    vbroadcasti128 -11*16($RNDKEYLAST_PTR), $TMP0
+    @{[ _vaesenc_4x $TMP0, @AESDATA ]}
+    vbroadcasti128 -10*16($RNDKEYLAST_PTR), $TMP0
+    @{[ _vaesenc_4x $TMP0, @AESDATA ]}
+.Laes128$local_label_suffix:
+___
+
+    # Finish the AES encryption of the counter blocks in AESDATA[0-3],
+    # interleaved with the GHASH update of the ciphertext blocks.
+    for my $i ( reverse 1 .. 9 ) {
+        $code .= <<___;
+        @{[ _ghash_step_4x 9-$i, @ghash_4x_args ]}
+        vbroadcasti128  -$i*16($RNDKEYLAST_PTR), $TMP0
+        @{[ _vaesenc_4x $TMP0, @AESDATA ]}
+___
+    }
+    $code .= <<___;
+    @{[ _ghash_step_4x 9, @ghash_4x_args ]}
+
+    @{[ $enc ? "sub \$-128, $DST" : "" ]}  # 128 is 4 bytes, -128 is 1 byte
+    @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA,
+                               $TMP0, $TMP1, $LO, $MI ]}
+    sub             \$-128, $SRC
+    @{[ !$enc ? "sub \$-128, $DST" : "" ]}
+    add             \$-128, $DATALEN
+    cmp             \$127, $DATALEN
+    ja              .Lcrypt_loop_4x$local_label_suffix
+___
+
+    if ($enc) {
+
+        # Update GHASH with the last set of ciphertext blocks.
+        $code .= <<___;
+.Lghash_last_ciphertext_4x$local_label_suffix:
+        @{[ _ghash_4x @ghash_4x_args ]}
+        sub             \$-128, $DST
+___
+    }
+
+    my $POWERS_PTR = $BE_CTR_PTR;    # BE_CTR_PTR is free to be reused.
+    my ( $HI, $HI_XMM ) = ( $H_POW2_XORED, $H_POW2_XORED_XMM );    # reuse
+
+    $code .= <<___;
+.Lcrypt_loop_4x_done$local_label_suffix:
+    # Check whether any data remains.
+    test            $DATALEN, $DATALEN
+    jz              .Ldone$local_label_suffix
+
+    # DATALEN is in [16, 32, 48, 64, 80, 96, 112].
+
+    # Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N
+    # is the number of blocks that remain.
+    lea             $OFFSETOFEND_H_POWERS($HTABLE), $POWERS_PTR
+    sub             $DATALEN, $POWERS_PTR
+
+    # Start collecting the unreduced GHASH intermediate value LO, MI, HI.
+    vpxor           $LO_XMM, $LO_XMM, $LO_XMM
+    vpxor           $MI_XMM, $MI_XMM, $MI_XMM
+    vpxor           $HI_XMM, $HI_XMM, $HI_XMM
+
+    cmp             \$64, $DATALEN
+    jb              .Llessthan64bytes$local_label_suffix
+
+    # DATALEN is in [64, 80, 96, 112].  Encrypt two vectors of counter blocks.
+    vpshufb         $BSWAP_MASK, $LE_CTR, $AESDATA0
+    vpaddd          .Linc_2blocks(%rip), $LE_CTR, $LE_CTR
+    vpshufb         $BSWAP_MASK, $LE_CTR, $AESDATA1
+    vpaddd          .Linc_2blocks(%rip), $LE_CTR, $LE_CTR
+    vpxor           $RNDKEY0, $AESDATA0, $AESDATA0
+    vpxor           $RNDKEY0, $AESDATA1, $AESDATA1
+    lea             16($AESKEY), %rax
+.Lvaesenc_loop_tail_1$local_label_suffix:
+    vbroadcasti128  (%rax), $TMP0
+    vaesenc         $TMP0, $AESDATA0, $AESDATA0
+    vaesenc         $TMP0, $AESDATA1, $AESDATA1
+    add             \$16, %rax
+    cmp             %rax, $RNDKEYLAST_PTR
+    jne             .Lvaesenc_loop_tail_1$local_label_suffix
+    vaesenclast     $RNDKEYLAST, $AESDATA0, $AESDATA0
+    vaesenclast     $RNDKEYLAST, $AESDATA1, $AESDATA1
+
+    # XOR the data with the two vectors of keystream blocks.
+    vmovdqu         0($SRC), $TMP0
+    vmovdqu         32($SRC), $TMP1
+    vpxor           $TMP0, $AESDATA0, $AESDATA0
+    vpxor           $TMP1, $AESDATA1, $AESDATA1
+    vmovdqu         $AESDATA0, 0($DST)
+    vmovdqu         $AESDATA1, 32($DST)
+
+    # Update GHASH with two vectors of ciphertext blocks, without reducing.
+    vpshufb         $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0
+    vpshufb         $BSWAP_MASK, @{[ $enc ? $AESDATA1 : $TMP1 ]}, $AESDATA1
+    vpxor           $GHASH_ACC, $AESDATA0, $AESDATA0
+    vmovdqu         ($POWERS_PTR), $TMP0
+    vmovdqu         32($POWERS_PTR), $TMP1
+    vpclmulqdq      \$0x00, $TMP0, $AESDATA0, $LO
+    vpclmulqdq      \$0x01, $TMP0, $AESDATA0, $MI
+    vpclmulqdq      \$0x10, $TMP0, $AESDATA0, $TMP2
+    vpxor           $TMP2, $MI, $MI
+    vpclmulqdq      \$0x11, $TMP0, $AESDATA0, $HI
+    vpclmulqdq      \$0x00, $TMP1, $AESDATA1, $TMP2
+    vpxor           $TMP2, $LO, $LO
+    vpclmulqdq      \$0x01, $TMP1, $AESDATA1, $TMP2
+    vpxor           $TMP2, $MI, $MI
+    vpclmulqdq      \$0x10, $TMP1, $AESDATA1, $TMP2
+    vpxor           $TMP2, $MI, $MI
+    vpclmulqdq      \$0x11, $TMP1, $AESDATA1, $TMP2
+    vpxor           $TMP2, $HI, $HI
+
+    add             \$64, $POWERS_PTR
+    add             \$64, $SRC
+    add             \$64, $DST
+    sub             \$64, $DATALEN
+    jz              .Lreduce$local_label_suffix
+
+    vpxor           $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+
+    # DATALEN is in [16, 32, 48].  Encrypt the last two vectors of counter blocks.
+.Llessthan64bytes$local_label_suffix:
+    vpshufb         $BSWAP_MASK, $LE_CTR, $AESDATA0
+    vpaddd          .Linc_2blocks(%rip), $LE_CTR, $LE_CTR
+    vpshufb         $BSWAP_MASK, $LE_CTR, $AESDATA1
+    vpxor           $RNDKEY0, $AESDATA0, $AESDATA0
+    vpxor           $RNDKEY0, $AESDATA1, $AESDATA1
+    lea             16($AESKEY), %rax
+.Lvaesenc_loop_tail_2$local_label_suffix:
+    vbroadcasti128  (%rax), $TMP0
+    vaesenc         $TMP0, $AESDATA0, $AESDATA0
+    vaesenc         $TMP0, $AESDATA1, $AESDATA1
+    add             \$16, %rax
+    cmp             %rax, $RNDKEYLAST_PTR
+    jne             .Lvaesenc_loop_tail_2$local_label_suffix
+    vaesenclast     $RNDKEYLAST, $AESDATA0, $AESDATA0
+    vaesenclast     $RNDKEYLAST, $AESDATA1, $AESDATA1
+
+    # XOR the remaining data with the keystream blocks, and update GHASH with
+    # the remaining ciphertext blocks without reducing.
+
+    cmp             \$32, $DATALEN
+    jb              .Lxor_one_block$local_label_suffix
+    je              .Lxor_two_blocks$local_label_suffix
+
+.Lxor_three_blocks$local_label_suffix:
+    vmovdqu         0($SRC), $TMP0
+    vmovdqu         32($SRC), $TMP1_XMM
+    vpxor           $TMP0, $AESDATA0, $AESDATA0
+    vpxor           $TMP1_XMM, $AESDATA1_XMM, $AESDATA1_XMM
+    vmovdqu         $AESDATA0, 0($DST)
+    vmovdqu         $AESDATA1_XMM, 32($DST)
+
+    vpshufb         $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0
+    vpshufb         $BSWAP_MASK_XMM, @{[ $enc ? $AESDATA1_XMM : $TMP1_XMM ]}, $AESDATA1_XMM
+    vpxor           $GHASH_ACC, $AESDATA0, $AESDATA0
+    vmovdqu         ($POWERS_PTR), $TMP0
+    vmovdqu         32($POWERS_PTR), $TMP1_XMM
+    vpclmulqdq      \$0x00, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM
+    vpxor           $TMP2, $LO, $LO
+    vpclmulqdq      \$0x01, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM
+    vpxor           $TMP2, $MI, $MI
+    vpclmulqdq      \$0x10, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM
+    vpxor           $TMP2, $MI, $MI
+    vpclmulqdq      \$0x11, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM
+    vpxor           $TMP2, $HI, $HI
+    jmp             .Lghash_mul_one_vec_unreduced$local_label_suffix
+
+.Lxor_two_blocks$local_label_suffix:
+    vmovdqu         ($SRC), $TMP0
+    vpxor           $TMP0, $AESDATA0, $AESDATA0
+    vmovdqu         $AESDATA0, ($DST)
+    vpshufb         $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0
+    vpxor           $GHASH_ACC, $AESDATA0, $AESDATA0
+    vmovdqu         ($POWERS_PTR), $TMP0
+    jmp             .Lghash_mul_one_vec_unreduced$local_label_suffix
+
+.Lxor_one_block$local_label_suffix:
+    vmovdqu         ($SRC), $TMP0_XMM
+    vpxor           $TMP0_XMM, $AESDATA0_XMM, $AESDATA0_XMM
+    vmovdqu         $AESDATA0_XMM, ($DST)
+    vpshufb         $BSWAP_MASK_XMM, @{[ $enc ? $AESDATA0_XMM : $TMP0_XMM ]}, $AESDATA0_XMM
+    vpxor           $GHASH_ACC_XMM, $AESDATA0_XMM, $AESDATA0_XMM
+    vmovdqu         ($POWERS_PTR), $TMP0_XMM
+
+.Lghash_mul_one_vec_unreduced$local_label_suffix:
+    vpclmulqdq      \$0x00, $TMP0, $AESDATA0, $TMP2
+    vpxor           $TMP2, $LO, $LO
+    vpclmulqdq      \$0x01, $TMP0, $AESDATA0, $TMP2
+    vpxor           $TMP2, $MI, $MI
+    vpclmulqdq      \$0x10, $TMP0, $AESDATA0, $TMP2
+    vpxor           $TMP2, $MI, $MI
+    vpclmulqdq      \$0x11, $TMP0, $AESDATA0, $TMP2
+    vpxor           $TMP2, $HI, $HI
+
+.Lreduce$local_label_suffix:
+    # Finally, do the GHASH reduction.
+    vbroadcasti128  .Lgfpoly(%rip), $TMP0
+    vpclmulqdq      \$0x01, $LO, $TMP0, $TMP1
+    vpshufd         \$0x4e, $LO, $LO
+    vpxor           $LO, $MI, $MI
+    vpxor           $TMP1, $MI, $MI
+    vpclmulqdq      \$0x01, $MI, $TMP0, $TMP1
+    vpshufd         \$0x4e, $MI, $MI
+    vpxor           $MI, $HI, $HI
+    vpxor           $TMP1, $HI, $HI
+    vextracti128    \$1, $HI, $GHASH_ACC_XMM
+    vpxor           $HI_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+
+.Ldone$local_label_suffix:
+    # Store the updated GHASH accumulator back to memory.
+    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+    vmovdqu         $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
+
+    vzeroupper
+___
+    return $code;
+}
+
+$code .= _begin_func "aes_gcm_enc_update_vaes_avx2", 1;
+$code .= _aes_gcm_update 1;
+$code .= _end_func;
+
+$code .= _begin_func "aes_gcm_dec_update_vaes_avx2", 1;
+$code .= _aes_gcm_update 0;
+$code .= _end_func;
+
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
+exit 0;
diff --git a/crypto/fipsmodule/modes/gcm.cc.inc b/crypto/fipsmodule/modes/gcm.cc.inc
index d3c829a..e77c525 100644
--- a/crypto/fipsmodule/modes/gcm.cc.inc
+++ b/crypto/fipsmodule/modes/gcm.cc.inc
@@ -99,6 +99,11 @@
                              uint8_t Xi[16], const u128 Htable[16],
                              enum gcm_impl_t impl) {
   switch (impl) {
+    case gcm_x86_vaes_avx2:
+      len &= kSizeTWithoutLower4Bits;
+      aes_gcm_enc_update_vaes_avx2(in, out, len, key, ivec, Htable, Xi);
+      CRYPTO_store_u32_be(&ivec[12], CRYPTO_load_u32_be(&ivec[12]) + len / 16);
+      return len;
     case gcm_x86_vaes_avx10_256:
       len &= kSizeTWithoutLower4Bits;
       aes_gcm_enc_update_vaes_avx10_256(in, out, len, key, ivec, Htable, Xi);
@@ -119,6 +124,11 @@
                              uint8_t Xi[16], const u128 Htable[16],
                              enum gcm_impl_t impl) {
   switch (impl) {
+    case gcm_x86_vaes_avx2:
+      len &= kSizeTWithoutLower4Bits;
+      aes_gcm_dec_update_vaes_avx2(in, out, len, key, ivec, Htable, Xi);
+      CRYPTO_store_u32_be(&ivec[12], CRYPTO_load_u32_be(&ivec[12]) + len / 16);
+      return len;
     case gcm_x86_vaes_avx10_256:
       len &= kSizeTWithoutLower4Bits;
       aes_gcm_dec_update_vaes_avx10_256(in, out, len, key, ivec, Htable, Xi);
@@ -171,15 +181,21 @@
 
 #if defined(GHASH_ASM_X86_64)
   if (crypto_gcm_clmul_enabled()) {
-    if (CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() &&
-        CRYPTO_is_VPCLMULQDQ_capable() && CRYPTO_is_BMI2_capable()) {
-      gcm_init_vpclmulqdq_avx10(out_table, H);
-      *out_mult = gcm_gmult_vpclmulqdq_avx10;
-      if (CRYPTO_cpu_avoid_zmm_registers()) {
-        *out_hash = gcm_ghash_vpclmulqdq_avx10_256;
-      } else {
-        *out_hash = gcm_ghash_vpclmulqdq_avx10_512;
+    if (CRYPTO_is_VPCLMULQDQ_capable() && CRYPTO_is_AVX2_capable()) {
+      if (CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() &&
+          CRYPTO_is_BMI2_capable()) {
+        gcm_init_vpclmulqdq_avx10(out_table, H);
+        *out_mult = gcm_gmult_vpclmulqdq_avx10;
+        if (CRYPTO_cpu_avoid_zmm_registers()) {
+          *out_hash = gcm_ghash_vpclmulqdq_avx10_256;
+        } else {
+          *out_hash = gcm_ghash_vpclmulqdq_avx10_512;
+        }
+        return;
       }
+      gcm_init_vpclmulqdq_avx2(out_table, H);
+      *out_mult = gcm_gmult_vpclmulqdq_avx2;
+      *out_hash = gcm_ghash_vpclmulqdq_avx2;
       return;
     }
     if (CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable()) {
@@ -265,6 +281,9 @@
   } else if (gcm_key->ghash == gcm_ghash_vpclmulqdq_avx10_512 &&
              CRYPTO_is_VAES_capable()) {
     gcm_key->impl = gcm_x86_vaes_avx10_512;
+  } else if (gcm_key->ghash == gcm_ghash_vpclmulqdq_avx2 &&
+             CRYPTO_is_VAES_capable()) {
+    gcm_key->impl = gcm_x86_vaes_avx2;
   } else if (gcm_key->ghash == gcm_ghash_avx && is_hwaes) {
     gcm_key->impl = gcm_x86_aesni;
   }
diff --git a/crypto/fipsmodule/modes/gcm_test.cc b/crypto/fipsmodule/modes/gcm_test.cc
index fafde9c..d195526 100644
--- a/crypto/fipsmodule/modes/gcm_test.cc
+++ b/crypto/fipsmodule/modes/gcm_test.cc
@@ -82,6 +82,29 @@
       }
     }
     if (CRYPTO_is_VAES_capable() && CRYPTO_is_VPCLMULQDQ_capable() &&
+        CRYPTO_is_AVX2_capable()) {
+      AES_KEY aes_key;
+      static const uint8_t kKey[16] = {0};
+      uint8_t iv[16] = {0};
+
+      CHECK_ABI_SEH(gcm_init_vpclmulqdq_avx2, Htable, kH);
+      CHECK_ABI_SEH(gcm_gmult_vpclmulqdq_avx2, X, Htable);
+      for (size_t blocks : kBlockCounts) {
+        CHECK_ABI_SEH(gcm_ghash_vpclmulqdq_avx2, X, Htable, buf, 16 * blocks);
+      }
+
+      aes_hw_set_encrypt_key(kKey, 128, &aes_key);
+      for (size_t blocks : kBlockCounts) {
+        CHECK_ABI_SEH(aes_gcm_enc_update_vaes_avx2, buf, buf, blocks * 16,
+                      &aes_key, iv, Htable, X);
+      }
+      aes_hw_set_decrypt_key(kKey, 128, &aes_key);
+      for (size_t blocks : kBlockCounts) {
+        CHECK_ABI_SEH(aes_gcm_dec_update_vaes_avx2, buf, buf, blocks * 16,
+                      &aes_key, iv, Htable, X);
+      }
+    }
+    if (CRYPTO_is_VAES_capable() && CRYPTO_is_VPCLMULQDQ_capable() &&
         CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() &&
         CRYPTO_is_BMI2_capable()) {
       AES_KEY aes_key;
diff --git a/crypto/fipsmodule/modes/internal.h b/crypto/fipsmodule/modes/internal.h
index a1f7bf5..f041bf8 100644
--- a/crypto/fipsmodule/modes/internal.h
+++ b/crypto/fipsmodule/modes/internal.h
@@ -69,6 +69,7 @@
 enum gcm_impl_t {
   gcm_separate = 0,  // No combined AES-GCM, but may have AES-CTR and GHASH.
   gcm_x86_aesni,
+  gcm_x86_vaes_avx2,
   gcm_x86_vaes_avx10_256,
   gcm_x86_vaes_avx10_512,
   gcm_arm64_aes,
@@ -200,6 +201,17 @@
                          const AES_KEY *key, uint8_t ivec[16],
                          const u128 Htable[16], uint8_t Xi[16]);
 
+void gcm_init_vpclmulqdq_avx2(u128 Htable[16], const uint64_t H[2]);
+void gcm_gmult_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16]);
+void gcm_ghash_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16],
+                               const uint8_t *in, size_t len);
+void aes_gcm_enc_update_vaes_avx2(const uint8_t *in, uint8_t *out, size_t len,
+                                  const AES_KEY *key, const uint8_t ivec[16],
+                                  const u128 Htable[16], uint8_t Xi[16]);
+void aes_gcm_dec_update_vaes_avx2(const uint8_t *in, uint8_t *out, size_t len,
+                                  const AES_KEY *key, const uint8_t ivec[16],
+                                  const u128 Htable[16], uint8_t Xi[16]);
+
 void gcm_init_vpclmulqdq_avx10(u128 Htable[16], const uint64_t H[2]);
 void gcm_gmult_vpclmulqdq_avx10(uint8_t Xi[16], const u128 Htable[16]);
 void gcm_ghash_vpclmulqdq_avx10_256(uint8_t Xi[16], const u128 Htable[16],
diff --git a/crypto/impl_dispatch_test.cc b/crypto/impl_dispatch_test.cc
index 8c8d1d1..bfd0045 100644
--- a/crypto/impl_dispatch_test.cc
+++ b/crypto/impl_dispatch_test.cc
@@ -37,8 +37,9 @@
     avx_movbe_ = CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable();
     ssse3_ = CRYPTO_is_SSSE3_capable();
     vaes_ = CRYPTO_is_VAES_capable() && CRYPTO_is_VPCLMULQDQ_capable() &&
-            CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() &&
-            CRYPTO_is_BMI2_capable();
+            CRYPTO_is_AVX2_capable();
+    avx10_ = CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() &&
+             CRYPTO_is_BMI2_capable();
     avoid_zmm_ = CRYPTO_cpu_avoid_zmm_registers();
     is_x86_64_ =
 #if defined(OPENSSL_X86_64)
@@ -80,6 +81,7 @@
   bool ssse3_ = false;
   bool is_x86_64_ = false;
   bool vaes_ = false;
+  bool avx10_ = false;
   bool avoid_zmm_ = false;
 #endif
 };
@@ -95,6 +97,7 @@
 constexpr size_t kFlag_vpaes_set_encrypt_key = 5;
 constexpr size_t kFlag_aes_gcm_enc_update_vaes_avx10_256 = 6;
 constexpr size_t kFlag_aes_gcm_enc_update_vaes_avx10_512 = 7;
+constexpr size_t kFlag_aes_gcm_enc_update_vaes_avx2 = 8;
 
 TEST_F(ImplDispatchTest, AEAD_AES_GCM) {
   AssertFunctionsHit(
@@ -107,9 +110,10 @@
           {kFlag_vpaes_encrypt, ssse3_ && !aesni_},
           {kFlag_vpaes_set_encrypt_key, ssse3_ && !aesni_},
           {kFlag_aes_gcm_enc_update_vaes_avx10_256,
-           is_x86_64_ && vaes_ && avoid_zmm_},
+           is_x86_64_ && vaes_ && avx10_ && avoid_zmm_},
           {kFlag_aes_gcm_enc_update_vaes_avx10_512,
-           is_x86_64_ && vaes_ && !avoid_zmm_},
+           is_x86_64_ && vaes_ && avx10_ && !avoid_zmm_},
+          {kFlag_aes_gcm_enc_update_vaes_avx2, is_x86_64_ && vaes_ && !avx10_},
       },
       [] {
         const uint8_t kZeros[16] = {0};
diff --git a/crypto/internal.h b/crypto/internal.h
index d50e755..62273c6 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -1410,7 +1410,8 @@
 //   5: vpaes_set_encrypt_key
 //   6: aes_gcm_enc_update_vaes_avx10_256
 //   7: aes_gcm_enc_update_vaes_avx10_512
-extern uint8_t BORINGSSL_function_hit[8];
+//   8: aes_gcm_enc_update_vaes_avx2
+extern uint8_t BORINGSSL_function_hit[9];
 #endif  // BORINGSSL_DISPATCH_TEST
 
 // OPENSSL_vasprintf_internal is just like |vasprintf(3)|. If |system_malloc| is
diff --git a/gen/bcm/aes-gcm-avx2-x86_64-apple.S b/gen/bcm/aes-gcm-avx2-x86_64-apple.S
new file mode 100644
index 0000000..e401e66
--- /dev/null
+++ b/gen/bcm/aes-gcm-avx2-x86_64-apple.S
@@ -0,0 +1,1309 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.section	__DATA,__const
+.p2align	4
+
+
+L$bswap_mask:
+.quad	0x08090a0b0c0d0e0f, 0x0001020304050607
+
+
+
+
+
+
+
+
+L$gfpoly:
+.quad	1, 0xc200000000000000
+
+
+L$gfpoly_and_internal_carrybit:
+.quad	1, 0xc200000000000001
+
+.p2align	5
+
+L$ctr_pattern:
+.quad	0, 0
+.quad	1, 0
+L$inc_2blocks:
+.quad	2, 0
+.quad	2, 0
+
+.text	
+.globl	_gcm_init_vpclmulqdq_avx2
+.private_extern _gcm_init_vpclmulqdq_avx2
+
+.p2align	5
+_gcm_init_vpclmulqdq_avx2:
+
+
+_CET_ENDBR
+
+
+
+
+
+	vpshufd	$0x4e,(%rsi),%xmm3
+
+
+
+
+
+	vpshufd	$0xd3,%xmm3,%xmm0
+	vpsrad	$31,%xmm0,%xmm0
+	vpaddq	%xmm3,%xmm3,%xmm3
+	vpand	L$gfpoly_and_internal_carrybit(%rip),%xmm0,%xmm0
+	vpxor	%xmm0,%xmm3,%xmm3
+
+	vbroadcasti128	L$gfpoly(%rip),%ymm6
+
+
+	vpclmulqdq	$0x00,%xmm3,%xmm3,%xmm0
+	vpclmulqdq	$0x01,%xmm3,%xmm3,%xmm1
+	vpclmulqdq	$0x10,%xmm3,%xmm3,%xmm2
+	vpxor	%xmm2,%xmm1,%xmm1
+	vpclmulqdq	$0x01,%xmm0,%xmm6,%xmm2
+	vpshufd	$0x4e,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm3,%xmm3,%xmm5
+	vpclmulqdq	$0x01,%xmm1,%xmm6,%xmm0
+	vpshufd	$0x4e,%xmm1,%xmm1
+	vpxor	%xmm1,%xmm5,%xmm5
+	vpxor	%xmm0,%xmm5,%xmm5
+
+
+
+	vinserti128	$1,%xmm3,%ymm5,%ymm3
+	vinserti128	$1,%xmm5,%ymm5,%ymm5
+
+
+	vpclmulqdq	$0x00,%ymm5,%ymm3,%ymm0
+	vpclmulqdq	$0x01,%ymm5,%ymm3,%ymm1
+	vpclmulqdq	$0x10,%ymm5,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpclmulqdq	$0x01,%ymm0,%ymm6,%ymm2
+	vpshufd	$0x4e,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm1,%ymm1
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpclmulqdq	$0x11,%ymm5,%ymm3,%ymm4
+	vpclmulqdq	$0x01,%ymm1,%ymm6,%ymm0
+	vpshufd	$0x4e,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm4,%ymm4
+	vpxor	%ymm0,%ymm4,%ymm4
+
+
+
+	vmovdqu	%ymm3,96(%rdi)
+	vmovdqu	%ymm4,64(%rdi)
+
+
+
+	vpunpcklqdq	%ymm3,%ymm4,%ymm0
+	vpunpckhqdq	%ymm3,%ymm4,%ymm1
+	vpxor	%ymm1,%ymm0,%ymm0
+	vmovdqu	%ymm0,128+32(%rdi)
+
+
+	vpclmulqdq	$0x00,%ymm5,%ymm4,%ymm0
+	vpclmulqdq	$0x01,%ymm5,%ymm4,%ymm1
+	vpclmulqdq	$0x10,%ymm5,%ymm4,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpclmulqdq	$0x01,%ymm0,%ymm6,%ymm2
+	vpshufd	$0x4e,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm1,%ymm1
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpclmulqdq	$0x11,%ymm5,%ymm4,%ymm3
+	vpclmulqdq	$0x01,%ymm1,%ymm6,%ymm0
+	vpshufd	$0x4e,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm3,%ymm3
+	vpxor	%ymm0,%ymm3,%ymm3
+
+	vpclmulqdq	$0x00,%ymm5,%ymm3,%ymm0
+	vpclmulqdq	$0x01,%ymm5,%ymm3,%ymm1
+	vpclmulqdq	$0x10,%ymm5,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpclmulqdq	$0x01,%ymm0,%ymm6,%ymm2
+	vpshufd	$0x4e,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm1,%ymm1
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpclmulqdq	$0x11,%ymm5,%ymm3,%ymm4
+	vpclmulqdq	$0x01,%ymm1,%ymm6,%ymm0
+	vpshufd	$0x4e,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm4,%ymm4
+	vpxor	%ymm0,%ymm4,%ymm4
+
+	vmovdqu	%ymm3,32(%rdi)
+	vmovdqu	%ymm4,0(%rdi)
+
+
+
+	vpunpcklqdq	%ymm3,%ymm4,%ymm0
+	vpunpckhqdq	%ymm3,%ymm4,%ymm1
+	vpxor	%ymm1,%ymm0,%ymm0
+	vmovdqu	%ymm0,128(%rdi)
+
+	vzeroupper
+	ret
+
+
+
+.globl	_gcm_gmult_vpclmulqdq_avx2
+.private_extern _gcm_gmult_vpclmulqdq_avx2
+
+.p2align	5
+_gcm_gmult_vpclmulqdq_avx2:
+
+
+_CET_ENDBR
+
+
+
+	vmovdqu	(%rdi),%xmm0
+	vmovdqu	L$bswap_mask(%rip),%xmm1
+	vmovdqu	128-16(%rsi),%xmm2
+	vmovdqu	L$gfpoly(%rip),%xmm3
+	vpshufb	%xmm1,%xmm0,%xmm0
+
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm4
+	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm5
+	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x01,%xmm4,%xmm3,%xmm6
+	vpshufd	$0x4e,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm0
+	vpclmulqdq	$0x01,%xmm5,%xmm3,%xmm4
+	vpshufd	$0x4e,%xmm5,%xmm5
+	vpxor	%xmm5,%xmm0,%xmm0
+	vpxor	%xmm4,%xmm0,%xmm0
+
+
+	vpshufb	%xmm1,%xmm0,%xmm0
+	vmovdqu	%xmm0,(%rdi)
+	ret
+
+
+
+.globl	_gcm_ghash_vpclmulqdq_avx2
+.private_extern _gcm_ghash_vpclmulqdq_avx2
+
+.p2align	5
+_gcm_ghash_vpclmulqdq_avx2:
+
+
+_CET_ENDBR
+
+
+
+	vbroadcasti128	L$bswap_mask(%rip),%ymm6
+	vmovdqu	(%rdi),%xmm5
+	vpshufb	%xmm6,%xmm5,%xmm5
+	vbroadcasti128	L$gfpoly(%rip),%ymm7
+
+
+	cmpq	$32,%rcx
+	jb	L$ghash_lastblock
+
+	cmpq	$127,%rcx
+	jbe	L$ghash_loop_1x
+
+
+	vmovdqu	128(%rsi),%ymm8
+	vmovdqu	128+32(%rsi),%ymm9
+L$ghash_loop_4x:
+
+	vmovdqu	0(%rdx),%ymm1
+	vpshufb	%ymm6,%ymm1,%ymm1
+	vmovdqu	0(%rsi),%ymm2
+	vpxor	%ymm5,%ymm1,%ymm1
+	vpclmulqdq	$0x00,%ymm2,%ymm1,%ymm3
+	vpclmulqdq	$0x11,%ymm2,%ymm1,%ymm5
+	vpunpckhqdq	%ymm1,%ymm1,%ymm0
+	vpxor	%ymm1,%ymm0,%ymm0
+	vpclmulqdq	$0x00,%ymm8,%ymm0,%ymm4
+
+	vmovdqu	32(%rdx),%ymm1
+	vpshufb	%ymm6,%ymm1,%ymm1
+	vmovdqu	32(%rsi),%ymm2
+	vpclmulqdq	$0x00,%ymm2,%ymm1,%ymm0
+	vpxor	%ymm0,%ymm3,%ymm3
+	vpclmulqdq	$0x11,%ymm2,%ymm1,%ymm0
+	vpxor	%ymm0,%ymm5,%ymm5
+	vpunpckhqdq	%ymm1,%ymm1,%ymm0
+	vpxor	%ymm1,%ymm0,%ymm0
+	vpclmulqdq	$0x10,%ymm8,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm4,%ymm4
+
+	vmovdqu	64(%rdx),%ymm1
+	vpshufb	%ymm6,%ymm1,%ymm1
+	vmovdqu	64(%rsi),%ymm2
+	vpclmulqdq	$0x00,%ymm2,%ymm1,%ymm0
+	vpxor	%ymm0,%ymm3,%ymm3
+	vpclmulqdq	$0x11,%ymm2,%ymm1,%ymm0
+	vpxor	%ymm0,%ymm5,%ymm5
+	vpunpckhqdq	%ymm1,%ymm1,%ymm0
+	vpxor	%ymm1,%ymm0,%ymm0
+	vpclmulqdq	$0x00,%ymm9,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm4,%ymm4
+
+
+	vmovdqu	96(%rdx),%ymm1
+	vpshufb	%ymm6,%ymm1,%ymm1
+	vmovdqu	96(%rsi),%ymm2
+	vpclmulqdq	$0x00,%ymm2,%ymm1,%ymm0
+	vpxor	%ymm0,%ymm3,%ymm3
+	vpclmulqdq	$0x11,%ymm2,%ymm1,%ymm0
+	vpxor	%ymm0,%ymm5,%ymm5
+	vpunpckhqdq	%ymm1,%ymm1,%ymm0
+	vpxor	%ymm1,%ymm0,%ymm0
+	vpclmulqdq	$0x10,%ymm9,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm4,%ymm4
+
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpxor	%ymm5,%ymm4,%ymm4
+
+
+	vbroadcasti128	L$gfpoly(%rip),%ymm2
+	vpclmulqdq	$0x01,%ymm3,%ymm2,%ymm0
+	vpshufd	$0x4e,%ymm3,%ymm3
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpxor	%ymm0,%ymm4,%ymm4
+
+	vpclmulqdq	$0x01,%ymm4,%ymm2,%ymm0
+	vpshufd	$0x4e,%ymm4,%ymm4
+	vpxor	%ymm4,%ymm5,%ymm5
+	vpxor	%ymm0,%ymm5,%ymm5
+	vextracti128	$1,%ymm5,%xmm0
+	vpxor	%xmm0,%xmm5,%xmm5
+
+	subq	$-128,%rdx
+	addq	$-128,%rcx
+	cmpq	$127,%rcx
+	ja	L$ghash_loop_4x
+
+
+	cmpq	$32,%rcx
+	jb	L$ghash_loop_1x_done
+L$ghash_loop_1x:
+	vmovdqu	(%rdx),%ymm0
+	vpshufb	%ymm6,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm5,%ymm5
+	vmovdqu	128-32(%rsi),%ymm0
+	vpclmulqdq	$0x00,%ymm0,%ymm5,%ymm1
+	vpclmulqdq	$0x01,%ymm0,%ymm5,%ymm2
+	vpclmulqdq	$0x10,%ymm0,%ymm5,%ymm3
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x01,%ymm1,%ymm7,%ymm3
+	vpshufd	$0x4e,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm2,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x11,%ymm0,%ymm5,%ymm5
+	vpclmulqdq	$0x01,%ymm2,%ymm7,%ymm1
+	vpshufd	$0x4e,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpxor	%ymm1,%ymm5,%ymm5
+
+	vextracti128	$1,%ymm5,%xmm0
+	vpxor	%xmm0,%xmm5,%xmm5
+	addq	$32,%rdx
+	subq	$32,%rcx
+	cmpq	$32,%rcx
+	jae	L$ghash_loop_1x
+L$ghash_loop_1x_done:
+
+
+	vzeroupper
+
+
+L$ghash_lastblock:
+	testq	%rcx,%rcx
+	jz	L$ghash_done
+	vmovdqu	(%rdx),%xmm0
+	vpshufb	%xmm6,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm5,%xmm5
+	vmovdqu	128-16(%rsi),%xmm0
+	vpclmulqdq	$0x00,%xmm0,%xmm5,%xmm1
+	vpclmulqdq	$0x01,%xmm0,%xmm5,%xmm2
+	vpclmulqdq	$0x10,%xmm0,%xmm5,%xmm3
+	vpxor	%xmm3,%xmm2,%xmm2
+	vpclmulqdq	$0x01,%xmm1,%xmm7,%xmm3
+	vpshufd	$0x4e,%xmm1,%xmm1
+	vpxor	%xmm1,%xmm2,%xmm2
+	vpxor	%xmm3,%xmm2,%xmm2
+	vpclmulqdq	$0x11,%xmm0,%xmm5,%xmm5
+	vpclmulqdq	$0x01,%xmm2,%xmm7,%xmm1
+	vpshufd	$0x4e,%xmm2,%xmm2
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpxor	%xmm1,%xmm5,%xmm5
+
+
+L$ghash_done:
+
+	vpshufb	%xmm6,%xmm5,%xmm5
+	vmovdqu	%xmm5,(%rdi)
+	ret
+
+
+
+.globl	_aes_gcm_enc_update_vaes_avx2
+.private_extern _aes_gcm_enc_update_vaes_avx2
+
+.p2align	5
+_aes_gcm_enc_update_vaes_avx2:
+
+
+_CET_ENDBR
+	pushq	%r12
+
+
+	movq	16(%rsp),%r12
+#ifdef BORINGSSL_DISPATCH_TEST
+
+	movb	$1,_BORINGSSL_function_hit+8(%rip)
+#endif
+	vbroadcasti128	L$bswap_mask(%rip),%ymm0
+
+
+
+	vmovdqu	(%r12),%xmm1
+	vpshufb	%xmm0,%xmm1,%xmm1
+	vbroadcasti128	(%r8),%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm11
+
+
+
+	movl	240(%rcx),%r10d
+	leal	-20(,%r10,4),%r10d
+
+
+
+
+	leaq	96(%rcx,%r10,4),%r11
+	vbroadcasti128	(%rcx),%ymm9
+	vbroadcasti128	(%r11),%ymm10
+
+
+	vpaddd	L$ctr_pattern(%rip),%ymm11,%ymm11
+
+
+
+	cmpq	$127,%rdx
+	jbe	L$crypt_loop_4x_done__func1
+
+	vmovdqu	128(%r9),%ymm7
+	vmovdqu	128+32(%r9),%ymm8
+
+
+
+	vmovdqu	L$inc_2blocks(%rip),%ymm2
+	vpshufb	%ymm0,%ymm11,%ymm12
+	vpaddd	%ymm2,%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm13
+	vpaddd	%ymm2,%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm14
+	vpaddd	%ymm2,%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm15
+	vpaddd	%ymm2,%ymm11,%ymm11
+
+
+	vpxor	%ymm9,%ymm12,%ymm12
+	vpxor	%ymm9,%ymm13,%ymm13
+	vpxor	%ymm9,%ymm14,%ymm14
+	vpxor	%ymm9,%ymm15,%ymm15
+
+	leaq	16(%rcx),%rax
+L$vaesenc_loop_first_4_vecs__func1:
+	vbroadcasti128	(%rax),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	L$vaesenc_loop_first_4_vecs__func1
+	vpxor	0(%rdi),%ymm10,%ymm2
+	vpxor	32(%rdi),%ymm10,%ymm3
+	vpxor	64(%rdi),%ymm10,%ymm5
+	vpxor	96(%rdi),%ymm10,%ymm6
+	vaesenclast	%ymm2,%ymm12,%ymm12
+	vaesenclast	%ymm3,%ymm13,%ymm13
+	vaesenclast	%ymm5,%ymm14,%ymm14
+	vaesenclast	%ymm6,%ymm15,%ymm15
+	vmovdqu	%ymm12,0(%rsi)
+	vmovdqu	%ymm13,32(%rsi)
+	vmovdqu	%ymm14,64(%rsi)
+	vmovdqu	%ymm15,96(%rsi)
+
+	subq	$-128,%rdi
+	addq	$-128,%rdx
+	cmpq	$127,%rdx
+	jbe	L$ghash_last_ciphertext_4x__func1
+.p2align	4
+L$crypt_loop_4x__func1:
+
+
+
+
+	vmovdqu	L$inc_2blocks(%rip),%ymm2
+	vpshufb	%ymm0,%ymm11,%ymm12
+	vpaddd	%ymm2,%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm13
+	vpaddd	%ymm2,%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm14
+	vpaddd	%ymm2,%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm15
+	vpaddd	%ymm2,%ymm11,%ymm11
+
+
+	vpxor	%ymm9,%ymm12,%ymm12
+	vpxor	%ymm9,%ymm13,%ymm13
+	vpxor	%ymm9,%ymm14,%ymm14
+	vpxor	%ymm9,%ymm15,%ymm15
+
+	cmpl	$24,%r10d
+	jl	L$aes128__func1
+	je	L$aes192__func1
+
+	vbroadcasti128	-208(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vbroadcasti128	-192(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+L$aes192__func1:
+	vbroadcasti128	-176(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vbroadcasti128	-160(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+L$aes128__func1:
+
+	vmovdqu	0(%rsi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+	vmovdqu	0(%r9),%ymm4
+	vpxor	%ymm1,%ymm3,%ymm3
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm1
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x00,%ymm7,%ymm2,%ymm6
+
+	vbroadcasti128	-144(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+
+	vbroadcasti128	-128(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+
+	vmovdqu	32(%rsi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+	vmovdqu	32(%r9),%ymm4
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x10,%ymm7,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm6,%ymm6
+
+	vbroadcasti128	-112(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+
+	vmovdqu	64(%rsi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+	vmovdqu	64(%r9),%ymm4
+
+	vbroadcasti128	-96(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+
+	vbroadcasti128	-80(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x00,%ymm8,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm6,%ymm6
+
+
+	vmovdqu	96(%rsi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+
+	vbroadcasti128	-64(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vmovdqu	96(%r9),%ymm4
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x10,%ymm8,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm6,%ymm6
+
+	vbroadcasti128	-48(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+
+	vpxor	%ymm5,%ymm6,%ymm6
+	vpxor	%ymm1,%ymm6,%ymm6
+
+
+	vbroadcasti128	L$gfpoly(%rip),%ymm4
+	vpclmulqdq	$0x01,%ymm5,%ymm4,%ymm2
+	vpshufd	$0x4e,%ymm5,%ymm5
+	vpxor	%ymm5,%ymm6,%ymm6
+	vpxor	%ymm2,%ymm6,%ymm6
+
+	vbroadcasti128	-32(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+
+	vpclmulqdq	$0x01,%ymm6,%ymm4,%ymm2
+	vpshufd	$0x4e,%ymm6,%ymm6
+	vpxor	%ymm6,%ymm1,%ymm1
+	vpxor	%ymm2,%ymm1,%ymm1
+
+	vbroadcasti128	-16(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vextracti128	$1,%ymm1,%xmm2
+	vpxor	%xmm2,%xmm1,%xmm1
+
+
+	subq	$-128,%rsi
+	vpxor	0(%rdi),%ymm10,%ymm2
+	vpxor	32(%rdi),%ymm10,%ymm3
+	vpxor	64(%rdi),%ymm10,%ymm5
+	vpxor	96(%rdi),%ymm10,%ymm6
+	vaesenclast	%ymm2,%ymm12,%ymm12
+	vaesenclast	%ymm3,%ymm13,%ymm13
+	vaesenclast	%ymm5,%ymm14,%ymm14
+	vaesenclast	%ymm6,%ymm15,%ymm15
+	vmovdqu	%ymm12,0(%rsi)
+	vmovdqu	%ymm13,32(%rsi)
+	vmovdqu	%ymm14,64(%rsi)
+	vmovdqu	%ymm15,96(%rsi)
+
+	subq	$-128,%rdi
+
+	addq	$-128,%rdx
+	cmpq	$127,%rdx
+	ja	L$crypt_loop_4x__func1
+L$ghash_last_ciphertext_4x__func1:
+
+	vmovdqu	0(%rsi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+	vmovdqu	0(%r9),%ymm4
+	vpxor	%ymm1,%ymm3,%ymm3
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm1
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x00,%ymm7,%ymm2,%ymm6
+
+	vmovdqu	32(%rsi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+	vmovdqu	32(%r9),%ymm4
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x10,%ymm7,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm6,%ymm6
+
+	vmovdqu	64(%rsi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+	vmovdqu	64(%r9),%ymm4
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x00,%ymm8,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm6,%ymm6
+
+
+	vmovdqu	96(%rsi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+	vmovdqu	96(%r9),%ymm4
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x10,%ymm8,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm6,%ymm6
+
+	vpxor	%ymm5,%ymm6,%ymm6
+	vpxor	%ymm1,%ymm6,%ymm6
+
+
+	vbroadcasti128	L$gfpoly(%rip),%ymm4
+	vpclmulqdq	$0x01,%ymm5,%ymm4,%ymm2
+	vpshufd	$0x4e,%ymm5,%ymm5
+	vpxor	%ymm5,%ymm6,%ymm6
+	vpxor	%ymm2,%ymm6,%ymm6
+
+	vpclmulqdq	$0x01,%ymm6,%ymm4,%ymm2
+	vpshufd	$0x4e,%ymm6,%ymm6
+	vpxor	%ymm6,%ymm1,%ymm1
+	vpxor	%ymm2,%ymm1,%ymm1
+	vextracti128	$1,%ymm1,%xmm2
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	subq	$-128,%rsi
+L$crypt_loop_4x_done__func1:
+
+	testq	%rdx,%rdx
+	jz	L$done__func1
+
+
+
+
+
+	leaq	128(%r9),%r8
+	subq	%rdx,%r8
+
+
+	vpxor	%xmm5,%xmm5,%xmm5
+	vpxor	%xmm6,%xmm6,%xmm6
+	vpxor	%xmm7,%xmm7,%xmm7
+
+	cmpq	$64,%rdx
+	jb	L$lessthan64bytes__func1
+
+
+	vpshufb	%ymm0,%ymm11,%ymm12
+	vpaddd	L$inc_2blocks(%rip),%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm13
+	vpaddd	L$inc_2blocks(%rip),%ymm11,%ymm11
+	vpxor	%ymm9,%ymm12,%ymm12
+	vpxor	%ymm9,%ymm13,%ymm13
+	leaq	16(%rcx),%rax
+L$vaesenc_loop_tail_1__func1:
+	vbroadcasti128	(%rax),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	L$vaesenc_loop_tail_1__func1
+	vaesenclast	%ymm10,%ymm12,%ymm12
+	vaesenclast	%ymm10,%ymm13,%ymm13
+
+
+	vmovdqu	0(%rdi),%ymm2
+	vmovdqu	32(%rdi),%ymm3
+	vpxor	%ymm2,%ymm12,%ymm12
+	vpxor	%ymm3,%ymm13,%ymm13
+	vmovdqu	%ymm12,0(%rsi)
+	vmovdqu	%ymm13,32(%rsi)
+
+
+	vpshufb	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm0,%ymm13,%ymm13
+	vpxor	%ymm1,%ymm12,%ymm12
+	vmovdqu	(%r8),%ymm2
+	vmovdqu	32(%r8),%ymm3
+	vpclmulqdq	$0x00,%ymm2,%ymm12,%ymm5
+	vpclmulqdq	$0x01,%ymm2,%ymm12,%ymm6
+	vpclmulqdq	$0x10,%ymm2,%ymm12,%ymm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x11,%ymm2,%ymm12,%ymm7
+	vpclmulqdq	$0x00,%ymm3,%ymm13,%ymm4
+	vpxor	%ymm4,%ymm5,%ymm5
+	vpclmulqdq	$0x01,%ymm3,%ymm13,%ymm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x10,%ymm3,%ymm13,%ymm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x11,%ymm3,%ymm13,%ymm4
+	vpxor	%ymm4,%ymm7,%ymm7
+
+	addq	$64,%r8
+	addq	$64,%rdi
+	addq	$64,%rsi
+	subq	$64,%rdx
+	jz	L$reduce__func1
+
+	vpxor	%xmm1,%xmm1,%xmm1
+
+
+L$lessthan64bytes__func1:
+	vpshufb	%ymm0,%ymm11,%ymm12
+	vpaddd	L$inc_2blocks(%rip),%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm13
+	vpxor	%ymm9,%ymm12,%ymm12
+	vpxor	%ymm9,%ymm13,%ymm13
+	leaq	16(%rcx),%rax
+L$vaesenc_loop_tail_2__func1:
+	vbroadcasti128	(%rax),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	L$vaesenc_loop_tail_2__func1
+	vaesenclast	%ymm10,%ymm12,%ymm12
+	vaesenclast	%ymm10,%ymm13,%ymm13
+
+
+
+
+	cmpq	$32,%rdx
+	jb	L$xor_one_block__func1
+	je	L$xor_two_blocks__func1
+
+L$xor_three_blocks__func1:
+	vmovdqu	0(%rdi),%ymm2
+	vmovdqu	32(%rdi),%xmm3
+	vpxor	%ymm2,%ymm12,%ymm12
+	vpxor	%xmm3,%xmm13,%xmm13
+	vmovdqu	%ymm12,0(%rsi)
+	vmovdqu	%xmm13,32(%rsi)
+
+	vpshufb	%ymm0,%ymm12,%ymm12
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vpxor	%ymm1,%ymm12,%ymm12
+	vmovdqu	(%r8),%ymm2
+	vmovdqu	32(%r8),%xmm3
+	vpclmulqdq	$0x00,%xmm3,%xmm13,%xmm4
+	vpxor	%ymm4,%ymm5,%ymm5
+	vpclmulqdq	$0x01,%xmm3,%xmm13,%xmm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x10,%xmm3,%xmm13,%xmm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x11,%xmm3,%xmm13,%xmm4
+	vpxor	%ymm4,%ymm7,%ymm7
+	jmp	L$ghash_mul_one_vec_unreduced__func1
+
+L$xor_two_blocks__func1:
+	vmovdqu	(%rdi),%ymm2
+	vpxor	%ymm2,%ymm12,%ymm12
+	vmovdqu	%ymm12,(%rsi)
+	vpshufb	%ymm0,%ymm12,%ymm12
+	vpxor	%ymm1,%ymm12,%ymm12
+	vmovdqu	(%r8),%ymm2
+	jmp	L$ghash_mul_one_vec_unreduced__func1
+
+L$xor_one_block__func1:
+	vmovdqu	(%rdi),%xmm2
+	vpxor	%xmm2,%xmm12,%xmm12
+	vmovdqu	%xmm12,(%rsi)
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vpxor	%xmm1,%xmm12,%xmm12
+	vmovdqu	(%r8),%xmm2
+
+L$ghash_mul_one_vec_unreduced__func1:
+	vpclmulqdq	$0x00,%ymm2,%ymm12,%ymm4
+	vpxor	%ymm4,%ymm5,%ymm5
+	vpclmulqdq	$0x01,%ymm2,%ymm12,%ymm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x10,%ymm2,%ymm12,%ymm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x11,%ymm2,%ymm12,%ymm4
+	vpxor	%ymm4,%ymm7,%ymm7
+
+L$reduce__func1:
+
+	vbroadcasti128	L$gfpoly(%rip),%ymm2
+	vpclmulqdq	$0x01,%ymm5,%ymm2,%ymm3
+	vpshufd	$0x4e,%ymm5,%ymm5
+	vpxor	%ymm5,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpclmulqdq	$0x01,%ymm6,%ymm2,%ymm3
+	vpshufd	$0x4e,%ymm6,%ymm6
+	vpxor	%ymm6,%ymm7,%ymm7
+	vpxor	%ymm3,%ymm7,%ymm7
+	vextracti128	$1,%ymm7,%xmm1
+	vpxor	%xmm7,%xmm1,%xmm1
+
+L$done__func1:
+
+	vpshufb	%xmm0,%xmm1,%xmm1
+	vmovdqu	%xmm1,(%r12)
+
+	vzeroupper
+	popq	%r12
+
+	ret
+
+
+
+.globl	_aes_gcm_dec_update_vaes_avx2
+.private_extern _aes_gcm_dec_update_vaes_avx2
+
+.p2align	5
+_aes_gcm_dec_update_vaes_avx2:
+
+
+_CET_ENDBR
+	pushq	%r12
+
+
+	movq	16(%rsp),%r12
+	vbroadcasti128	L$bswap_mask(%rip),%ymm0
+
+
+
+	vmovdqu	(%r12),%xmm1
+	vpshufb	%xmm0,%xmm1,%xmm1
+	vbroadcasti128	(%r8),%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm11
+
+
+
+	movl	240(%rcx),%r10d
+	leal	-20(,%r10,4),%r10d
+
+
+
+
+	leaq	96(%rcx,%r10,4),%r11
+	vbroadcasti128	(%rcx),%ymm9
+	vbroadcasti128	(%r11),%ymm10
+
+
+	vpaddd	L$ctr_pattern(%rip),%ymm11,%ymm11
+
+
+
+	cmpq	$127,%rdx
+	jbe	L$crypt_loop_4x_done__func2
+
+	vmovdqu	128(%r9),%ymm7
+	vmovdqu	128+32(%r9),%ymm8
+.p2align	4
+L$crypt_loop_4x__func2:
+
+
+
+
+	vmovdqu	L$inc_2blocks(%rip),%ymm2
+	vpshufb	%ymm0,%ymm11,%ymm12
+	vpaddd	%ymm2,%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm13
+	vpaddd	%ymm2,%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm14
+	vpaddd	%ymm2,%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm15
+	vpaddd	%ymm2,%ymm11,%ymm11
+
+
+	vpxor	%ymm9,%ymm12,%ymm12
+	vpxor	%ymm9,%ymm13,%ymm13
+	vpxor	%ymm9,%ymm14,%ymm14
+	vpxor	%ymm9,%ymm15,%ymm15
+
+	cmpl	$24,%r10d
+	jl	L$aes128__func2
+	je	L$aes192__func2
+
+	vbroadcasti128	-208(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vbroadcasti128	-192(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+L$aes192__func2:
+	vbroadcasti128	-176(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vbroadcasti128	-160(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+L$aes128__func2:
+
+	vmovdqu	0(%rdi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+	vmovdqu	0(%r9),%ymm4
+	vpxor	%ymm1,%ymm3,%ymm3
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm1
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x00,%ymm7,%ymm2,%ymm6
+
+	vbroadcasti128	-144(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+
+	vbroadcasti128	-128(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+
+	vmovdqu	32(%rdi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+	vmovdqu	32(%r9),%ymm4
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x10,%ymm7,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm6,%ymm6
+
+	vbroadcasti128	-112(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+
+	vmovdqu	64(%rdi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+	vmovdqu	64(%r9),%ymm4
+
+	vbroadcasti128	-96(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+
+	vbroadcasti128	-80(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x00,%ymm8,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm6,%ymm6
+
+
+	vmovdqu	96(%rdi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+
+	vbroadcasti128	-64(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vmovdqu	96(%r9),%ymm4
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x10,%ymm8,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm6,%ymm6
+
+	vbroadcasti128	-48(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+
+	vpxor	%ymm5,%ymm6,%ymm6
+	vpxor	%ymm1,%ymm6,%ymm6
+
+
+	vbroadcasti128	L$gfpoly(%rip),%ymm4
+	vpclmulqdq	$0x01,%ymm5,%ymm4,%ymm2
+	vpshufd	$0x4e,%ymm5,%ymm5
+	vpxor	%ymm5,%ymm6,%ymm6
+	vpxor	%ymm2,%ymm6,%ymm6
+
+	vbroadcasti128	-32(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+
+	vpclmulqdq	$0x01,%ymm6,%ymm4,%ymm2
+	vpshufd	$0x4e,%ymm6,%ymm6
+	vpxor	%ymm6,%ymm1,%ymm1
+	vpxor	%ymm2,%ymm1,%ymm1
+
+	vbroadcasti128	-16(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vextracti128	$1,%ymm1,%xmm2
+	vpxor	%xmm2,%xmm1,%xmm1
+
+
+
+	vpxor	0(%rdi),%ymm10,%ymm2
+	vpxor	32(%rdi),%ymm10,%ymm3
+	vpxor	64(%rdi),%ymm10,%ymm5
+	vpxor	96(%rdi),%ymm10,%ymm6
+	vaesenclast	%ymm2,%ymm12,%ymm12
+	vaesenclast	%ymm3,%ymm13,%ymm13
+	vaesenclast	%ymm5,%ymm14,%ymm14
+	vaesenclast	%ymm6,%ymm15,%ymm15
+	vmovdqu	%ymm12,0(%rsi)
+	vmovdqu	%ymm13,32(%rsi)
+	vmovdqu	%ymm14,64(%rsi)
+	vmovdqu	%ymm15,96(%rsi)
+
+	subq	$-128,%rdi
+	subq	$-128,%rsi
+	addq	$-128,%rdx
+	cmpq	$127,%rdx
+	ja	L$crypt_loop_4x__func2
+L$crypt_loop_4x_done__func2:
+
+	testq	%rdx,%rdx
+	jz	L$done__func2
+
+
+
+
+
+	leaq	128(%r9),%r8
+	subq	%rdx,%r8
+
+
+	vpxor	%xmm5,%xmm5,%xmm5
+	vpxor	%xmm6,%xmm6,%xmm6
+	vpxor	%xmm7,%xmm7,%xmm7
+
+	cmpq	$64,%rdx
+	jb	L$lessthan64bytes__func2
+
+
+	vpshufb	%ymm0,%ymm11,%ymm12
+	vpaddd	L$inc_2blocks(%rip),%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm13
+	vpaddd	L$inc_2blocks(%rip),%ymm11,%ymm11
+	vpxor	%ymm9,%ymm12,%ymm12
+	vpxor	%ymm9,%ymm13,%ymm13
+	leaq	16(%rcx),%rax
+L$vaesenc_loop_tail_1__func2:
+	vbroadcasti128	(%rax),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	L$vaesenc_loop_tail_1__func2
+	vaesenclast	%ymm10,%ymm12,%ymm12
+	vaesenclast	%ymm10,%ymm13,%ymm13
+
+
+	vmovdqu	0(%rdi),%ymm2
+	vmovdqu	32(%rdi),%ymm3
+	vpxor	%ymm2,%ymm12,%ymm12
+	vpxor	%ymm3,%ymm13,%ymm13
+	vmovdqu	%ymm12,0(%rsi)
+	vmovdqu	%ymm13,32(%rsi)
+
+
+	vpshufb	%ymm0,%ymm2,%ymm12
+	vpshufb	%ymm0,%ymm3,%ymm13
+	vpxor	%ymm1,%ymm12,%ymm12
+	vmovdqu	(%r8),%ymm2
+	vmovdqu	32(%r8),%ymm3
+	vpclmulqdq	$0x00,%ymm2,%ymm12,%ymm5
+	vpclmulqdq	$0x01,%ymm2,%ymm12,%ymm6
+	vpclmulqdq	$0x10,%ymm2,%ymm12,%ymm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x11,%ymm2,%ymm12,%ymm7
+	vpclmulqdq	$0x00,%ymm3,%ymm13,%ymm4
+	vpxor	%ymm4,%ymm5,%ymm5
+	vpclmulqdq	$0x01,%ymm3,%ymm13,%ymm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x10,%ymm3,%ymm13,%ymm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x11,%ymm3,%ymm13,%ymm4
+	vpxor	%ymm4,%ymm7,%ymm7
+
+	addq	$64,%r8
+	addq	$64,%rdi
+	addq	$64,%rsi
+	subq	$64,%rdx
+	jz	L$reduce__func2
+
+	vpxor	%xmm1,%xmm1,%xmm1
+
+
+L$lessthan64bytes__func2:
+	vpshufb	%ymm0,%ymm11,%ymm12
+	vpaddd	L$inc_2blocks(%rip),%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm13
+	vpxor	%ymm9,%ymm12,%ymm12
+	vpxor	%ymm9,%ymm13,%ymm13
+	leaq	16(%rcx),%rax
+L$vaesenc_loop_tail_2__func2:
+	vbroadcasti128	(%rax),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	L$vaesenc_loop_tail_2__func2
+	vaesenclast	%ymm10,%ymm12,%ymm12
+	vaesenclast	%ymm10,%ymm13,%ymm13
+
+
+
+
+	cmpq	$32,%rdx
+	jb	L$xor_one_block__func2
+	je	L$xor_two_blocks__func2
+
+L$xor_three_blocks__func2:
+	vmovdqu	0(%rdi),%ymm2
+	vmovdqu	32(%rdi),%xmm3
+	vpxor	%ymm2,%ymm12,%ymm12
+	vpxor	%xmm3,%xmm13,%xmm13
+	vmovdqu	%ymm12,0(%rsi)
+	vmovdqu	%xmm13,32(%rsi)
+
+	vpshufb	%ymm0,%ymm2,%ymm12
+	vpshufb	%xmm0,%xmm3,%xmm13
+	vpxor	%ymm1,%ymm12,%ymm12
+	vmovdqu	(%r8),%ymm2
+	vmovdqu	32(%r8),%xmm3
+	vpclmulqdq	$0x00,%xmm3,%xmm13,%xmm4
+	vpxor	%ymm4,%ymm5,%ymm5
+	vpclmulqdq	$0x01,%xmm3,%xmm13,%xmm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x10,%xmm3,%xmm13,%xmm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x11,%xmm3,%xmm13,%xmm4
+	vpxor	%ymm4,%ymm7,%ymm7
+	jmp	L$ghash_mul_one_vec_unreduced__func2
+
+L$xor_two_blocks__func2:
+	vmovdqu	(%rdi),%ymm2
+	vpxor	%ymm2,%ymm12,%ymm12
+	vmovdqu	%ymm12,(%rsi)
+	vpshufb	%ymm0,%ymm2,%ymm12
+	vpxor	%ymm1,%ymm12,%ymm12
+	vmovdqu	(%r8),%ymm2
+	jmp	L$ghash_mul_one_vec_unreduced__func2
+
+L$xor_one_block__func2:
+	vmovdqu	(%rdi),%xmm2
+	vpxor	%xmm2,%xmm12,%xmm12
+	vmovdqu	%xmm12,(%rsi)
+	vpshufb	%xmm0,%xmm2,%xmm12
+	vpxor	%xmm1,%xmm12,%xmm12
+	vmovdqu	(%r8),%xmm2
+
+L$ghash_mul_one_vec_unreduced__func2:
+	vpclmulqdq	$0x00,%ymm2,%ymm12,%ymm4
+	vpxor	%ymm4,%ymm5,%ymm5
+	vpclmulqdq	$0x01,%ymm2,%ymm12,%ymm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x10,%ymm2,%ymm12,%ymm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x11,%ymm2,%ymm12,%ymm4
+	vpxor	%ymm4,%ymm7,%ymm7
+
+L$reduce__func2:
+
+	vbroadcasti128	L$gfpoly(%rip),%ymm2
+	vpclmulqdq	$0x01,%ymm5,%ymm2,%ymm3
+	vpshufd	$0x4e,%ymm5,%ymm5
+	vpxor	%ymm5,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpclmulqdq	$0x01,%ymm6,%ymm2,%ymm3
+	vpshufd	$0x4e,%ymm6,%ymm6
+	vpxor	%ymm6,%ymm7,%ymm7
+	vpxor	%ymm3,%ymm7,%ymm7
+	vextracti128	$1,%ymm7,%xmm1
+	vpxor	%xmm7,%xmm1,%xmm1
+
+L$done__func2:
+
+	vpshufb	%xmm0,%xmm1,%xmm1
+	vmovdqu	%xmm1,(%r12)
+
+	vzeroupper
+	popq	%r12
+
+	ret
+
+
+
+#endif
diff --git a/gen/bcm/aes-gcm-avx2-x86_64-linux.S b/gen/bcm/aes-gcm-avx2-x86_64-linux.S
new file mode 100644
index 0000000..b7816cf
--- /dev/null
+++ b/gen/bcm/aes-gcm-avx2-x86_64-linux.S
@@ -0,0 +1,1314 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.section	.rodata
+.align	16
+
+
+.Lbswap_mask:
+.quad	0x08090a0b0c0d0e0f, 0x0001020304050607
+
+
+
+
+
+
+
+
+.Lgfpoly:
+.quad	1, 0xc200000000000000
+
+
+.Lgfpoly_and_internal_carrybit:
+.quad	1, 0xc200000000000001
+
+.align	32
+
+.Lctr_pattern:
+.quad	0, 0
+.quad	1, 0
+.Linc_2blocks:
+.quad	2, 0
+.quad	2, 0
+
+.text	
+.globl	gcm_init_vpclmulqdq_avx2
+.hidden gcm_init_vpclmulqdq_avx2
+.type	gcm_init_vpclmulqdq_avx2,@function
+.align	32
+gcm_init_vpclmulqdq_avx2:
+.cfi_startproc	
+
+_CET_ENDBR
+
+
+
+
+
+	vpshufd	$0x4e,(%rsi),%xmm3
+
+
+
+
+
+	vpshufd	$0xd3,%xmm3,%xmm0
+	vpsrad	$31,%xmm0,%xmm0
+	vpaddq	%xmm3,%xmm3,%xmm3
+	vpand	.Lgfpoly_and_internal_carrybit(%rip),%xmm0,%xmm0
+	vpxor	%xmm0,%xmm3,%xmm3
+
+	vbroadcasti128	.Lgfpoly(%rip),%ymm6
+
+
+	vpclmulqdq	$0x00,%xmm3,%xmm3,%xmm0
+	vpclmulqdq	$0x01,%xmm3,%xmm3,%xmm1
+	vpclmulqdq	$0x10,%xmm3,%xmm3,%xmm2
+	vpxor	%xmm2,%xmm1,%xmm1
+	vpclmulqdq	$0x01,%xmm0,%xmm6,%xmm2
+	vpshufd	$0x4e,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm3,%xmm3,%xmm5
+	vpclmulqdq	$0x01,%xmm1,%xmm6,%xmm0
+	vpshufd	$0x4e,%xmm1,%xmm1
+	vpxor	%xmm1,%xmm5,%xmm5
+	vpxor	%xmm0,%xmm5,%xmm5
+
+
+
+	vinserti128	$1,%xmm3,%ymm5,%ymm3
+	vinserti128	$1,%xmm5,%ymm5,%ymm5
+
+
+	vpclmulqdq	$0x00,%ymm5,%ymm3,%ymm0
+	vpclmulqdq	$0x01,%ymm5,%ymm3,%ymm1
+	vpclmulqdq	$0x10,%ymm5,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpclmulqdq	$0x01,%ymm0,%ymm6,%ymm2
+	vpshufd	$0x4e,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm1,%ymm1
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpclmulqdq	$0x11,%ymm5,%ymm3,%ymm4
+	vpclmulqdq	$0x01,%ymm1,%ymm6,%ymm0
+	vpshufd	$0x4e,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm4,%ymm4
+	vpxor	%ymm0,%ymm4,%ymm4
+
+
+
+	vmovdqu	%ymm3,96(%rdi)
+	vmovdqu	%ymm4,64(%rdi)
+
+
+
+	vpunpcklqdq	%ymm3,%ymm4,%ymm0
+	vpunpckhqdq	%ymm3,%ymm4,%ymm1
+	vpxor	%ymm1,%ymm0,%ymm0
+	vmovdqu	%ymm0,128+32(%rdi)
+
+
+	vpclmulqdq	$0x00,%ymm5,%ymm4,%ymm0
+	vpclmulqdq	$0x01,%ymm5,%ymm4,%ymm1
+	vpclmulqdq	$0x10,%ymm5,%ymm4,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpclmulqdq	$0x01,%ymm0,%ymm6,%ymm2
+	vpshufd	$0x4e,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm1,%ymm1
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpclmulqdq	$0x11,%ymm5,%ymm4,%ymm3
+	vpclmulqdq	$0x01,%ymm1,%ymm6,%ymm0
+	vpshufd	$0x4e,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm3,%ymm3
+	vpxor	%ymm0,%ymm3,%ymm3
+
+	vpclmulqdq	$0x00,%ymm5,%ymm3,%ymm0
+	vpclmulqdq	$0x01,%ymm5,%ymm3,%ymm1
+	vpclmulqdq	$0x10,%ymm5,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpclmulqdq	$0x01,%ymm0,%ymm6,%ymm2
+	vpshufd	$0x4e,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm1,%ymm1
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpclmulqdq	$0x11,%ymm5,%ymm3,%ymm4
+	vpclmulqdq	$0x01,%ymm1,%ymm6,%ymm0
+	vpshufd	$0x4e,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm4,%ymm4
+	vpxor	%ymm0,%ymm4,%ymm4
+
+	vmovdqu	%ymm3,32(%rdi)
+	vmovdqu	%ymm4,0(%rdi)
+
+
+
+	vpunpcklqdq	%ymm3,%ymm4,%ymm0
+	vpunpckhqdq	%ymm3,%ymm4,%ymm1
+	vpxor	%ymm1,%ymm0,%ymm0
+	vmovdqu	%ymm0,128(%rdi)
+
+	vzeroupper
+	ret
+
+.cfi_endproc	
+.size	gcm_init_vpclmulqdq_avx2, . - gcm_init_vpclmulqdq_avx2
+.globl	gcm_gmult_vpclmulqdq_avx2
+.hidden gcm_gmult_vpclmulqdq_avx2
+.type	gcm_gmult_vpclmulqdq_avx2,@function
+.align	32
+gcm_gmult_vpclmulqdq_avx2:
+.cfi_startproc	
+
+_CET_ENDBR
+
+
+
+	vmovdqu	(%rdi),%xmm0
+	vmovdqu	.Lbswap_mask(%rip),%xmm1
+	vmovdqu	128-16(%rsi),%xmm2
+	vmovdqu	.Lgfpoly(%rip),%xmm3
+	vpshufb	%xmm1,%xmm0,%xmm0
+
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm4
+	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm5
+	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x01,%xmm4,%xmm3,%xmm6
+	vpshufd	$0x4e,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm0
+	vpclmulqdq	$0x01,%xmm5,%xmm3,%xmm4
+	vpshufd	$0x4e,%xmm5,%xmm5
+	vpxor	%xmm5,%xmm0,%xmm0
+	vpxor	%xmm4,%xmm0,%xmm0
+
+
+	vpshufb	%xmm1,%xmm0,%xmm0
+	vmovdqu	%xmm0,(%rdi)
+	ret
+
+.cfi_endproc	
+.size	gcm_gmult_vpclmulqdq_avx2, . - gcm_gmult_vpclmulqdq_avx2
+.globl	gcm_ghash_vpclmulqdq_avx2
+.hidden gcm_ghash_vpclmulqdq_avx2
+.type	gcm_ghash_vpclmulqdq_avx2,@function
+.align	32
+gcm_ghash_vpclmulqdq_avx2:
+.cfi_startproc	
+
+_CET_ENDBR
+
+
+
+	vbroadcasti128	.Lbswap_mask(%rip),%ymm6
+	vmovdqu	(%rdi),%xmm5
+	vpshufb	%xmm6,%xmm5,%xmm5
+	vbroadcasti128	.Lgfpoly(%rip),%ymm7
+
+
+	cmpq	$32,%rcx
+	jb	.Lghash_lastblock
+
+	cmpq	$127,%rcx
+	jbe	.Lghash_loop_1x
+
+
+	vmovdqu	128(%rsi),%ymm8
+	vmovdqu	128+32(%rsi),%ymm9
+.Lghash_loop_4x:
+
+	vmovdqu	0(%rdx),%ymm1
+	vpshufb	%ymm6,%ymm1,%ymm1
+	vmovdqu	0(%rsi),%ymm2
+	vpxor	%ymm5,%ymm1,%ymm1
+	vpclmulqdq	$0x00,%ymm2,%ymm1,%ymm3
+	vpclmulqdq	$0x11,%ymm2,%ymm1,%ymm5
+	vpunpckhqdq	%ymm1,%ymm1,%ymm0
+	vpxor	%ymm1,%ymm0,%ymm0
+	vpclmulqdq	$0x00,%ymm8,%ymm0,%ymm4
+
+	vmovdqu	32(%rdx),%ymm1
+	vpshufb	%ymm6,%ymm1,%ymm1
+	vmovdqu	32(%rsi),%ymm2
+	vpclmulqdq	$0x00,%ymm2,%ymm1,%ymm0
+	vpxor	%ymm0,%ymm3,%ymm3
+	vpclmulqdq	$0x11,%ymm2,%ymm1,%ymm0
+	vpxor	%ymm0,%ymm5,%ymm5
+	vpunpckhqdq	%ymm1,%ymm1,%ymm0
+	vpxor	%ymm1,%ymm0,%ymm0
+	vpclmulqdq	$0x10,%ymm8,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm4,%ymm4
+
+	vmovdqu	64(%rdx),%ymm1
+	vpshufb	%ymm6,%ymm1,%ymm1
+	vmovdqu	64(%rsi),%ymm2
+	vpclmulqdq	$0x00,%ymm2,%ymm1,%ymm0
+	vpxor	%ymm0,%ymm3,%ymm3
+	vpclmulqdq	$0x11,%ymm2,%ymm1,%ymm0
+	vpxor	%ymm0,%ymm5,%ymm5
+	vpunpckhqdq	%ymm1,%ymm1,%ymm0
+	vpxor	%ymm1,%ymm0,%ymm0
+	vpclmulqdq	$0x00,%ymm9,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm4,%ymm4
+
+
+	vmovdqu	96(%rdx),%ymm1
+	vpshufb	%ymm6,%ymm1,%ymm1
+	vmovdqu	96(%rsi),%ymm2
+	vpclmulqdq	$0x00,%ymm2,%ymm1,%ymm0
+	vpxor	%ymm0,%ymm3,%ymm3
+	vpclmulqdq	$0x11,%ymm2,%ymm1,%ymm0
+	vpxor	%ymm0,%ymm5,%ymm5
+	vpunpckhqdq	%ymm1,%ymm1,%ymm0
+	vpxor	%ymm1,%ymm0,%ymm0
+	vpclmulqdq	$0x10,%ymm9,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm4,%ymm4
+
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpxor	%ymm5,%ymm4,%ymm4
+
+
+	vbroadcasti128	.Lgfpoly(%rip),%ymm2
+	vpclmulqdq	$0x01,%ymm3,%ymm2,%ymm0
+	vpshufd	$0x4e,%ymm3,%ymm3
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpxor	%ymm0,%ymm4,%ymm4
+
+	vpclmulqdq	$0x01,%ymm4,%ymm2,%ymm0
+	vpshufd	$0x4e,%ymm4,%ymm4
+	vpxor	%ymm4,%ymm5,%ymm5
+	vpxor	%ymm0,%ymm5,%ymm5
+	vextracti128	$1,%ymm5,%xmm0
+	vpxor	%xmm0,%xmm5,%xmm5
+
+	subq	$-128,%rdx
+	addq	$-128,%rcx
+	cmpq	$127,%rcx
+	ja	.Lghash_loop_4x
+
+
+	cmpq	$32,%rcx
+	jb	.Lghash_loop_1x_done
+.Lghash_loop_1x:
+	vmovdqu	(%rdx),%ymm0
+	vpshufb	%ymm6,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm5,%ymm5
+	vmovdqu	128-32(%rsi),%ymm0
+	vpclmulqdq	$0x00,%ymm0,%ymm5,%ymm1
+	vpclmulqdq	$0x01,%ymm0,%ymm5,%ymm2
+	vpclmulqdq	$0x10,%ymm0,%ymm5,%ymm3
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x01,%ymm1,%ymm7,%ymm3
+	vpshufd	$0x4e,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm2,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x11,%ymm0,%ymm5,%ymm5
+	vpclmulqdq	$0x01,%ymm2,%ymm7,%ymm1
+	vpshufd	$0x4e,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpxor	%ymm1,%ymm5,%ymm5
+
+	vextracti128	$1,%ymm5,%xmm0
+	vpxor	%xmm0,%xmm5,%xmm5
+	addq	$32,%rdx
+	subq	$32,%rcx
+	cmpq	$32,%rcx
+	jae	.Lghash_loop_1x
+.Lghash_loop_1x_done:
+
+
+	vzeroupper
+
+
+.Lghash_lastblock:
+	testq	%rcx,%rcx
+	jz	.Lghash_done
+	vmovdqu	(%rdx),%xmm0
+	vpshufb	%xmm6,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm5,%xmm5
+	vmovdqu	128-16(%rsi),%xmm0
+	vpclmulqdq	$0x00,%xmm0,%xmm5,%xmm1
+	vpclmulqdq	$0x01,%xmm0,%xmm5,%xmm2
+	vpclmulqdq	$0x10,%xmm0,%xmm5,%xmm3
+	vpxor	%xmm3,%xmm2,%xmm2
+	vpclmulqdq	$0x01,%xmm1,%xmm7,%xmm3
+	vpshufd	$0x4e,%xmm1,%xmm1
+	vpxor	%xmm1,%xmm2,%xmm2
+	vpxor	%xmm3,%xmm2,%xmm2
+	vpclmulqdq	$0x11,%xmm0,%xmm5,%xmm5
+	vpclmulqdq	$0x01,%xmm2,%xmm7,%xmm1
+	vpshufd	$0x4e,%xmm2,%xmm2
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpxor	%xmm1,%xmm5,%xmm5
+
+
+.Lghash_done:
+
+	vpshufb	%xmm6,%xmm5,%xmm5
+	vmovdqu	%xmm5,(%rdi)
+	ret
+
+.cfi_endproc	
+.size	gcm_ghash_vpclmulqdq_avx2, . - gcm_ghash_vpclmulqdq_avx2
+.globl	aes_gcm_enc_update_vaes_avx2
+.hidden aes_gcm_enc_update_vaes_avx2
+.type	aes_gcm_enc_update_vaes_avx2,@function
+.align	32
+aes_gcm_enc_update_vaes_avx2:
+.cfi_startproc	
+
+_CET_ENDBR
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-16
+
+	movq	16(%rsp),%r12
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern	BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+	movb	$1,BORINGSSL_function_hit+8(%rip)
+#endif
+	vbroadcasti128	.Lbswap_mask(%rip),%ymm0
+
+
+
+	vmovdqu	(%r12),%xmm1
+	vpshufb	%xmm0,%xmm1,%xmm1
+	vbroadcasti128	(%r8),%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm11
+
+
+
+	movl	240(%rcx),%r10d
+	leal	-20(,%r10,4),%r10d
+
+
+
+
+	leaq	96(%rcx,%r10,4),%r11
+	vbroadcasti128	(%rcx),%ymm9
+	vbroadcasti128	(%r11),%ymm10
+
+
+	vpaddd	.Lctr_pattern(%rip),%ymm11,%ymm11
+
+
+
+	cmpq	$127,%rdx
+	jbe	.Lcrypt_loop_4x_done__func1
+
+	vmovdqu	128(%r9),%ymm7
+	vmovdqu	128+32(%r9),%ymm8
+
+
+
+	vmovdqu	.Linc_2blocks(%rip),%ymm2
+	vpshufb	%ymm0,%ymm11,%ymm12
+	vpaddd	%ymm2,%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm13
+	vpaddd	%ymm2,%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm14
+	vpaddd	%ymm2,%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm15
+	vpaddd	%ymm2,%ymm11,%ymm11
+
+
+	vpxor	%ymm9,%ymm12,%ymm12
+	vpxor	%ymm9,%ymm13,%ymm13
+	vpxor	%ymm9,%ymm14,%ymm14
+	vpxor	%ymm9,%ymm15,%ymm15
+
+	leaq	16(%rcx),%rax
+.Lvaesenc_loop_first_4_vecs__func1:
+	vbroadcasti128	(%rax),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	.Lvaesenc_loop_first_4_vecs__func1
+	vpxor	0(%rdi),%ymm10,%ymm2
+	vpxor	32(%rdi),%ymm10,%ymm3
+	vpxor	64(%rdi),%ymm10,%ymm5
+	vpxor	96(%rdi),%ymm10,%ymm6
+	vaesenclast	%ymm2,%ymm12,%ymm12
+	vaesenclast	%ymm3,%ymm13,%ymm13
+	vaesenclast	%ymm5,%ymm14,%ymm14
+	vaesenclast	%ymm6,%ymm15,%ymm15
+	vmovdqu	%ymm12,0(%rsi)
+	vmovdqu	%ymm13,32(%rsi)
+	vmovdqu	%ymm14,64(%rsi)
+	vmovdqu	%ymm15,96(%rsi)
+
+	subq	$-128,%rdi
+	addq	$-128,%rdx
+	cmpq	$127,%rdx
+	jbe	.Lghash_last_ciphertext_4x__func1
+.align	16
+.Lcrypt_loop_4x__func1:
+
+
+
+
+	vmovdqu	.Linc_2blocks(%rip),%ymm2
+	vpshufb	%ymm0,%ymm11,%ymm12
+	vpaddd	%ymm2,%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm13
+	vpaddd	%ymm2,%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm14
+	vpaddd	%ymm2,%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm15
+	vpaddd	%ymm2,%ymm11,%ymm11
+
+
+	vpxor	%ymm9,%ymm12,%ymm12
+	vpxor	%ymm9,%ymm13,%ymm13
+	vpxor	%ymm9,%ymm14,%ymm14
+	vpxor	%ymm9,%ymm15,%ymm15
+
+	cmpl	$24,%r10d
+	jl	.Laes128__func1
+	je	.Laes192__func1
+
+	vbroadcasti128	-208(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vbroadcasti128	-192(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+.Laes192__func1:
+	vbroadcasti128	-176(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vbroadcasti128	-160(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+.Laes128__func1:
+
+	vmovdqu	0(%rsi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+	vmovdqu	0(%r9),%ymm4
+	vpxor	%ymm1,%ymm3,%ymm3
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm1
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x00,%ymm7,%ymm2,%ymm6
+
+	vbroadcasti128	-144(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+
+	vbroadcasti128	-128(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+
+	vmovdqu	32(%rsi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+	vmovdqu	32(%r9),%ymm4
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x10,%ymm7,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm6,%ymm6
+
+	vbroadcasti128	-112(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+
+	vmovdqu	64(%rsi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+	vmovdqu	64(%r9),%ymm4
+
+	vbroadcasti128	-96(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+
+	vbroadcasti128	-80(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x00,%ymm8,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm6,%ymm6
+
+
+	vmovdqu	96(%rsi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+
+	vbroadcasti128	-64(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vmovdqu	96(%r9),%ymm4
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x10,%ymm8,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm6,%ymm6
+
+	vbroadcasti128	-48(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+
+	vpxor	%ymm5,%ymm6,%ymm6
+	vpxor	%ymm1,%ymm6,%ymm6
+
+
+	vbroadcasti128	.Lgfpoly(%rip),%ymm4
+	vpclmulqdq	$0x01,%ymm5,%ymm4,%ymm2
+	vpshufd	$0x4e,%ymm5,%ymm5
+	vpxor	%ymm5,%ymm6,%ymm6
+	vpxor	%ymm2,%ymm6,%ymm6
+
+	vbroadcasti128	-32(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+
+	vpclmulqdq	$0x01,%ymm6,%ymm4,%ymm2
+	vpshufd	$0x4e,%ymm6,%ymm6
+	vpxor	%ymm6,%ymm1,%ymm1
+	vpxor	%ymm2,%ymm1,%ymm1
+
+	vbroadcasti128	-16(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vextracti128	$1,%ymm1,%xmm2
+	vpxor	%xmm2,%xmm1,%xmm1
+
+
+	subq	$-128,%rsi
+	vpxor	0(%rdi),%ymm10,%ymm2
+	vpxor	32(%rdi),%ymm10,%ymm3
+	vpxor	64(%rdi),%ymm10,%ymm5
+	vpxor	96(%rdi),%ymm10,%ymm6
+	vaesenclast	%ymm2,%ymm12,%ymm12
+	vaesenclast	%ymm3,%ymm13,%ymm13
+	vaesenclast	%ymm5,%ymm14,%ymm14
+	vaesenclast	%ymm6,%ymm15,%ymm15
+	vmovdqu	%ymm12,0(%rsi)
+	vmovdqu	%ymm13,32(%rsi)
+	vmovdqu	%ymm14,64(%rsi)
+	vmovdqu	%ymm15,96(%rsi)
+
+	subq	$-128,%rdi
+
+	addq	$-128,%rdx
+	cmpq	$127,%rdx
+	ja	.Lcrypt_loop_4x__func1
+.Lghash_last_ciphertext_4x__func1:
+
+	vmovdqu	0(%rsi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+	vmovdqu	0(%r9),%ymm4
+	vpxor	%ymm1,%ymm3,%ymm3
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm1
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x00,%ymm7,%ymm2,%ymm6
+
+	vmovdqu	32(%rsi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+	vmovdqu	32(%r9),%ymm4
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x10,%ymm7,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm6,%ymm6
+
+	vmovdqu	64(%rsi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+	vmovdqu	64(%r9),%ymm4
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x00,%ymm8,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm6,%ymm6
+
+
+	vmovdqu	96(%rsi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+	vmovdqu	96(%r9),%ymm4
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x10,%ymm8,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm6,%ymm6
+
+	vpxor	%ymm5,%ymm6,%ymm6
+	vpxor	%ymm1,%ymm6,%ymm6
+
+
+	vbroadcasti128	.Lgfpoly(%rip),%ymm4
+	vpclmulqdq	$0x01,%ymm5,%ymm4,%ymm2
+	vpshufd	$0x4e,%ymm5,%ymm5
+	vpxor	%ymm5,%ymm6,%ymm6
+	vpxor	%ymm2,%ymm6,%ymm6
+
+	vpclmulqdq	$0x01,%ymm6,%ymm4,%ymm2
+	vpshufd	$0x4e,%ymm6,%ymm6
+	vpxor	%ymm6,%ymm1,%ymm1
+	vpxor	%ymm2,%ymm1,%ymm1
+	vextracti128	$1,%ymm1,%xmm2
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	subq	$-128,%rsi
+.Lcrypt_loop_4x_done__func1:
+
+	testq	%rdx,%rdx
+	jz	.Ldone__func1
+
+
+
+
+
+	leaq	128(%r9),%r8
+	subq	%rdx,%r8
+
+
+	vpxor	%xmm5,%xmm5,%xmm5
+	vpxor	%xmm6,%xmm6,%xmm6
+	vpxor	%xmm7,%xmm7,%xmm7
+
+	cmpq	$64,%rdx
+	jb	.Llessthan64bytes__func1
+
+
+	vpshufb	%ymm0,%ymm11,%ymm12
+	vpaddd	.Linc_2blocks(%rip),%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm13
+	vpaddd	.Linc_2blocks(%rip),%ymm11,%ymm11
+	vpxor	%ymm9,%ymm12,%ymm12
+	vpxor	%ymm9,%ymm13,%ymm13
+	leaq	16(%rcx),%rax
+.Lvaesenc_loop_tail_1__func1:
+	vbroadcasti128	(%rax),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	.Lvaesenc_loop_tail_1__func1
+	vaesenclast	%ymm10,%ymm12,%ymm12
+	vaesenclast	%ymm10,%ymm13,%ymm13
+
+
+	vmovdqu	0(%rdi),%ymm2
+	vmovdqu	32(%rdi),%ymm3
+	vpxor	%ymm2,%ymm12,%ymm12
+	vpxor	%ymm3,%ymm13,%ymm13
+	vmovdqu	%ymm12,0(%rsi)
+	vmovdqu	%ymm13,32(%rsi)
+
+
+	vpshufb	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm0,%ymm13,%ymm13
+	vpxor	%ymm1,%ymm12,%ymm12
+	vmovdqu	(%r8),%ymm2
+	vmovdqu	32(%r8),%ymm3
+	vpclmulqdq	$0x00,%ymm2,%ymm12,%ymm5
+	vpclmulqdq	$0x01,%ymm2,%ymm12,%ymm6
+	vpclmulqdq	$0x10,%ymm2,%ymm12,%ymm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x11,%ymm2,%ymm12,%ymm7
+	vpclmulqdq	$0x00,%ymm3,%ymm13,%ymm4
+	vpxor	%ymm4,%ymm5,%ymm5
+	vpclmulqdq	$0x01,%ymm3,%ymm13,%ymm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x10,%ymm3,%ymm13,%ymm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x11,%ymm3,%ymm13,%ymm4
+	vpxor	%ymm4,%ymm7,%ymm7
+
+	addq	$64,%r8
+	addq	$64,%rdi
+	addq	$64,%rsi
+	subq	$64,%rdx
+	jz	.Lreduce__func1
+
+	vpxor	%xmm1,%xmm1,%xmm1
+
+
+.Llessthan64bytes__func1:
+	vpshufb	%ymm0,%ymm11,%ymm12
+	vpaddd	.Linc_2blocks(%rip),%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm13
+	vpxor	%ymm9,%ymm12,%ymm12
+	vpxor	%ymm9,%ymm13,%ymm13
+	leaq	16(%rcx),%rax
+.Lvaesenc_loop_tail_2__func1:
+	vbroadcasti128	(%rax),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	.Lvaesenc_loop_tail_2__func1
+	vaesenclast	%ymm10,%ymm12,%ymm12
+	vaesenclast	%ymm10,%ymm13,%ymm13
+
+
+
+
+	cmpq	$32,%rdx
+	jb	.Lxor_one_block__func1
+	je	.Lxor_two_blocks__func1
+
+.Lxor_three_blocks__func1:
+	vmovdqu	0(%rdi),%ymm2
+	vmovdqu	32(%rdi),%xmm3
+	vpxor	%ymm2,%ymm12,%ymm12
+	vpxor	%xmm3,%xmm13,%xmm13
+	vmovdqu	%ymm12,0(%rsi)
+	vmovdqu	%xmm13,32(%rsi)
+
+	vpshufb	%ymm0,%ymm12,%ymm12
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vpxor	%ymm1,%ymm12,%ymm12
+	vmovdqu	(%r8),%ymm2
+	vmovdqu	32(%r8),%xmm3
+	vpclmulqdq	$0x00,%xmm3,%xmm13,%xmm4
+	vpxor	%ymm4,%ymm5,%ymm5
+	vpclmulqdq	$0x01,%xmm3,%xmm13,%xmm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x10,%xmm3,%xmm13,%xmm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x11,%xmm3,%xmm13,%xmm4
+	vpxor	%ymm4,%ymm7,%ymm7
+	jmp	.Lghash_mul_one_vec_unreduced__func1
+
+.Lxor_two_blocks__func1:
+	vmovdqu	(%rdi),%ymm2
+	vpxor	%ymm2,%ymm12,%ymm12
+	vmovdqu	%ymm12,(%rsi)
+	vpshufb	%ymm0,%ymm12,%ymm12
+	vpxor	%ymm1,%ymm12,%ymm12
+	vmovdqu	(%r8),%ymm2
+	jmp	.Lghash_mul_one_vec_unreduced__func1
+
+.Lxor_one_block__func1:
+	vmovdqu	(%rdi),%xmm2
+	vpxor	%xmm2,%xmm12,%xmm12
+	vmovdqu	%xmm12,(%rsi)
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vpxor	%xmm1,%xmm12,%xmm12
+	vmovdqu	(%r8),%xmm2
+
+.Lghash_mul_one_vec_unreduced__func1:
+	vpclmulqdq	$0x00,%ymm2,%ymm12,%ymm4
+	vpxor	%ymm4,%ymm5,%ymm5
+	vpclmulqdq	$0x01,%ymm2,%ymm12,%ymm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x10,%ymm2,%ymm12,%ymm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x11,%ymm2,%ymm12,%ymm4
+	vpxor	%ymm4,%ymm7,%ymm7
+
+.Lreduce__func1:
+
+	vbroadcasti128	.Lgfpoly(%rip),%ymm2
+	vpclmulqdq	$0x01,%ymm5,%ymm2,%ymm3
+	vpshufd	$0x4e,%ymm5,%ymm5
+	vpxor	%ymm5,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpclmulqdq	$0x01,%ymm6,%ymm2,%ymm3
+	vpshufd	$0x4e,%ymm6,%ymm6
+	vpxor	%ymm6,%ymm7,%ymm7
+	vpxor	%ymm3,%ymm7,%ymm7
+	vextracti128	$1,%ymm7,%xmm1
+	vpxor	%xmm7,%xmm1,%xmm1
+
+.Ldone__func1:
+
+	vpshufb	%xmm0,%xmm1,%xmm1
+	vmovdqu	%xmm1,(%r12)
+
+	vzeroupper
+	popq	%r12
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r12
+	ret
+
+.cfi_endproc	
+.size	aes_gcm_enc_update_vaes_avx2, . - aes_gcm_enc_update_vaes_avx2
+.globl	aes_gcm_dec_update_vaes_avx2
+.hidden aes_gcm_dec_update_vaes_avx2
+.type	aes_gcm_dec_update_vaes_avx2,@function
+.align	32
+aes_gcm_dec_update_vaes_avx2:
+.cfi_startproc	
+
+_CET_ENDBR
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-16
+
+	movq	16(%rsp),%r12
+	vbroadcasti128	.Lbswap_mask(%rip),%ymm0
+
+
+
+	vmovdqu	(%r12),%xmm1
+	vpshufb	%xmm0,%xmm1,%xmm1
+	vbroadcasti128	(%r8),%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm11
+
+
+
+	movl	240(%rcx),%r10d
+	leal	-20(,%r10,4),%r10d
+
+
+
+
+	leaq	96(%rcx,%r10,4),%r11
+	vbroadcasti128	(%rcx),%ymm9
+	vbroadcasti128	(%r11),%ymm10
+
+
+	vpaddd	.Lctr_pattern(%rip),%ymm11,%ymm11
+
+
+
+	cmpq	$127,%rdx
+	jbe	.Lcrypt_loop_4x_done__func2
+
+	vmovdqu	128(%r9),%ymm7
+	vmovdqu	128+32(%r9),%ymm8
+.align	16
+.Lcrypt_loop_4x__func2:
+
+
+
+
+	vmovdqu	.Linc_2blocks(%rip),%ymm2
+	vpshufb	%ymm0,%ymm11,%ymm12
+	vpaddd	%ymm2,%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm13
+	vpaddd	%ymm2,%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm14
+	vpaddd	%ymm2,%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm15
+	vpaddd	%ymm2,%ymm11,%ymm11
+
+
+	vpxor	%ymm9,%ymm12,%ymm12
+	vpxor	%ymm9,%ymm13,%ymm13
+	vpxor	%ymm9,%ymm14,%ymm14
+	vpxor	%ymm9,%ymm15,%ymm15
+
+	cmpl	$24,%r10d
+	jl	.Laes128__func2
+	je	.Laes192__func2
+
+	vbroadcasti128	-208(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vbroadcasti128	-192(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+.Laes192__func2:
+	vbroadcasti128	-176(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vbroadcasti128	-160(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+.Laes128__func2:
+
+	vmovdqu	0(%rdi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+	vmovdqu	0(%r9),%ymm4
+	vpxor	%ymm1,%ymm3,%ymm3
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm1
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x00,%ymm7,%ymm2,%ymm6
+
+	vbroadcasti128	-144(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+
+	vbroadcasti128	-128(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+
+	vmovdqu	32(%rdi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+	vmovdqu	32(%r9),%ymm4
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x10,%ymm7,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm6,%ymm6
+
+	vbroadcasti128	-112(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+
+	vmovdqu	64(%rdi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+	vmovdqu	64(%r9),%ymm4
+
+	vbroadcasti128	-96(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+
+	vbroadcasti128	-80(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x00,%ymm8,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm6,%ymm6
+
+
+	vmovdqu	96(%rdi),%ymm3
+	vpshufb	%ymm0,%ymm3,%ymm3
+
+	vbroadcasti128	-64(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vmovdqu	96(%r9),%ymm4
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm5,%ymm5
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vpunpckhqdq	%ymm3,%ymm3,%ymm2
+	vpxor	%ymm3,%ymm2,%ymm2
+	vpclmulqdq	$0x10,%ymm8,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm6,%ymm6
+
+	vbroadcasti128	-48(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+
+	vpxor	%ymm5,%ymm6,%ymm6
+	vpxor	%ymm1,%ymm6,%ymm6
+
+
+	vbroadcasti128	.Lgfpoly(%rip),%ymm4
+	vpclmulqdq	$0x01,%ymm5,%ymm4,%ymm2
+	vpshufd	$0x4e,%ymm5,%ymm5
+	vpxor	%ymm5,%ymm6,%ymm6
+	vpxor	%ymm2,%ymm6,%ymm6
+
+	vbroadcasti128	-32(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+
+	vpclmulqdq	$0x01,%ymm6,%ymm4,%ymm2
+	vpshufd	$0x4e,%ymm6,%ymm6
+	vpxor	%ymm6,%ymm1,%ymm1
+	vpxor	%ymm2,%ymm1,%ymm1
+
+	vbroadcasti128	-16(%r11),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	vaesenc	%ymm2,%ymm14,%ymm14
+	vaesenc	%ymm2,%ymm15,%ymm15
+
+	vextracti128	$1,%ymm1,%xmm2
+	vpxor	%xmm2,%xmm1,%xmm1
+
+
+
+	vpxor	0(%rdi),%ymm10,%ymm2
+	vpxor	32(%rdi),%ymm10,%ymm3
+	vpxor	64(%rdi),%ymm10,%ymm5
+	vpxor	96(%rdi),%ymm10,%ymm6
+	vaesenclast	%ymm2,%ymm12,%ymm12
+	vaesenclast	%ymm3,%ymm13,%ymm13
+	vaesenclast	%ymm5,%ymm14,%ymm14
+	vaesenclast	%ymm6,%ymm15,%ymm15
+	vmovdqu	%ymm12,0(%rsi)
+	vmovdqu	%ymm13,32(%rsi)
+	vmovdqu	%ymm14,64(%rsi)
+	vmovdqu	%ymm15,96(%rsi)
+
+	subq	$-128,%rdi
+	subq	$-128,%rsi
+	addq	$-128,%rdx
+	cmpq	$127,%rdx
+	ja	.Lcrypt_loop_4x__func2
+.Lcrypt_loop_4x_done__func2:
+
+	testq	%rdx,%rdx
+	jz	.Ldone__func2
+
+
+
+
+
+	leaq	128(%r9),%r8
+	subq	%rdx,%r8
+
+
+	vpxor	%xmm5,%xmm5,%xmm5
+	vpxor	%xmm6,%xmm6,%xmm6
+	vpxor	%xmm7,%xmm7,%xmm7
+
+	cmpq	$64,%rdx
+	jb	.Llessthan64bytes__func2
+
+
+	vpshufb	%ymm0,%ymm11,%ymm12
+	vpaddd	.Linc_2blocks(%rip),%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm13
+	vpaddd	.Linc_2blocks(%rip),%ymm11,%ymm11
+	vpxor	%ymm9,%ymm12,%ymm12
+	vpxor	%ymm9,%ymm13,%ymm13
+	leaq	16(%rcx),%rax
+.Lvaesenc_loop_tail_1__func2:
+	vbroadcasti128	(%rax),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	.Lvaesenc_loop_tail_1__func2
+	vaesenclast	%ymm10,%ymm12,%ymm12
+	vaesenclast	%ymm10,%ymm13,%ymm13
+
+
+	vmovdqu	0(%rdi),%ymm2
+	vmovdqu	32(%rdi),%ymm3
+	vpxor	%ymm2,%ymm12,%ymm12
+	vpxor	%ymm3,%ymm13,%ymm13
+	vmovdqu	%ymm12,0(%rsi)
+	vmovdqu	%ymm13,32(%rsi)
+
+
+	vpshufb	%ymm0,%ymm2,%ymm12
+	vpshufb	%ymm0,%ymm3,%ymm13
+	vpxor	%ymm1,%ymm12,%ymm12
+	vmovdqu	(%r8),%ymm2
+	vmovdqu	32(%r8),%ymm3
+	vpclmulqdq	$0x00,%ymm2,%ymm12,%ymm5
+	vpclmulqdq	$0x01,%ymm2,%ymm12,%ymm6
+	vpclmulqdq	$0x10,%ymm2,%ymm12,%ymm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x11,%ymm2,%ymm12,%ymm7
+	vpclmulqdq	$0x00,%ymm3,%ymm13,%ymm4
+	vpxor	%ymm4,%ymm5,%ymm5
+	vpclmulqdq	$0x01,%ymm3,%ymm13,%ymm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x10,%ymm3,%ymm13,%ymm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x11,%ymm3,%ymm13,%ymm4
+	vpxor	%ymm4,%ymm7,%ymm7
+
+	addq	$64,%r8
+	addq	$64,%rdi
+	addq	$64,%rsi
+	subq	$64,%rdx
+	jz	.Lreduce__func2
+
+	vpxor	%xmm1,%xmm1,%xmm1
+
+
+.Llessthan64bytes__func2:
+	vpshufb	%ymm0,%ymm11,%ymm12
+	vpaddd	.Linc_2blocks(%rip),%ymm11,%ymm11
+	vpshufb	%ymm0,%ymm11,%ymm13
+	vpxor	%ymm9,%ymm12,%ymm12
+	vpxor	%ymm9,%ymm13,%ymm13
+	leaq	16(%rcx),%rax
+.Lvaesenc_loop_tail_2__func2:
+	vbroadcasti128	(%rax),%ymm2
+	vaesenc	%ymm2,%ymm12,%ymm12
+	vaesenc	%ymm2,%ymm13,%ymm13
+	addq	$16,%rax
+	cmpq	%rax,%r11
+	jne	.Lvaesenc_loop_tail_2__func2
+	vaesenclast	%ymm10,%ymm12,%ymm12
+	vaesenclast	%ymm10,%ymm13,%ymm13
+
+
+
+
+	cmpq	$32,%rdx
+	jb	.Lxor_one_block__func2
+	je	.Lxor_two_blocks__func2
+
+.Lxor_three_blocks__func2:
+	vmovdqu	0(%rdi),%ymm2
+	vmovdqu	32(%rdi),%xmm3
+	vpxor	%ymm2,%ymm12,%ymm12
+	vpxor	%xmm3,%xmm13,%xmm13
+	vmovdqu	%ymm12,0(%rsi)
+	vmovdqu	%xmm13,32(%rsi)
+
+	vpshufb	%ymm0,%ymm2,%ymm12
+	vpshufb	%xmm0,%xmm3,%xmm13
+	vpxor	%ymm1,%ymm12,%ymm12
+	vmovdqu	(%r8),%ymm2
+	vmovdqu	32(%r8),%xmm3
+	vpclmulqdq	$0x00,%xmm3,%xmm13,%xmm4
+	vpxor	%ymm4,%ymm5,%ymm5
+	vpclmulqdq	$0x01,%xmm3,%xmm13,%xmm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x10,%xmm3,%xmm13,%xmm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x11,%xmm3,%xmm13,%xmm4
+	vpxor	%ymm4,%ymm7,%ymm7
+	jmp	.Lghash_mul_one_vec_unreduced__func2
+
+.Lxor_two_blocks__func2:
+	vmovdqu	(%rdi),%ymm2
+	vpxor	%ymm2,%ymm12,%ymm12
+	vmovdqu	%ymm12,(%rsi)
+	vpshufb	%ymm0,%ymm2,%ymm12
+	vpxor	%ymm1,%ymm12,%ymm12
+	vmovdqu	(%r8),%ymm2
+	jmp	.Lghash_mul_one_vec_unreduced__func2
+
+.Lxor_one_block__func2:
+	vmovdqu	(%rdi),%xmm2
+	vpxor	%xmm2,%xmm12,%xmm12
+	vmovdqu	%xmm12,(%rsi)
+	vpshufb	%xmm0,%xmm2,%xmm12
+	vpxor	%xmm1,%xmm12,%xmm12
+	vmovdqu	(%r8),%xmm2
+
+.Lghash_mul_one_vec_unreduced__func2:
+	vpclmulqdq	$0x00,%ymm2,%ymm12,%ymm4
+	vpxor	%ymm4,%ymm5,%ymm5
+	vpclmulqdq	$0x01,%ymm2,%ymm12,%ymm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x10,%ymm2,%ymm12,%ymm4
+	vpxor	%ymm4,%ymm6,%ymm6
+	vpclmulqdq	$0x11,%ymm2,%ymm12,%ymm4
+	vpxor	%ymm4,%ymm7,%ymm7
+
+.Lreduce__func2:
+
+	vbroadcasti128	.Lgfpoly(%rip),%ymm2
+	vpclmulqdq	$0x01,%ymm5,%ymm2,%ymm3
+	vpshufd	$0x4e,%ymm5,%ymm5
+	vpxor	%ymm5,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpclmulqdq	$0x01,%ymm6,%ymm2,%ymm3
+	vpshufd	$0x4e,%ymm6,%ymm6
+	vpxor	%ymm6,%ymm7,%ymm7
+	vpxor	%ymm3,%ymm7,%ymm7
+	vextracti128	$1,%ymm7,%xmm1
+	vpxor	%xmm7,%xmm1,%xmm1
+
+.Ldone__func2:
+
+	vpshufb	%xmm0,%xmm1,%xmm1
+	vmovdqu	%xmm1,(%r12)
+
+	vzeroupper
+	popq	%r12
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r12
+	ret
+
+.cfi_endproc	
+.size	aes_gcm_dec_update_vaes_avx2, . - aes_gcm_dec_update_vaes_avx2
+#endif
diff --git a/gen/bcm/aes-gcm-avx2-x86_64-win.asm b/gen/bcm/aes-gcm-avx2-x86_64-win.asm
new file mode 100644
index 0000000..9201553
--- /dev/null
+++ b/gen/bcm/aes-gcm-avx2-x86_64-win.asm
@@ -0,0 +1,1588 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section	.rdata rdata align=8
+ALIGN	16
+
+
+$L$bswap_mask:
+	DQ	0x08090a0b0c0d0e0f,0x0001020304050607
+
+
+
+
+
+
+
+
+$L$gfpoly:
+	DQ	1,0xc200000000000000
+
+
+$L$gfpoly_and_internal_carrybit:
+	DQ	1,0xc200000000000001
+
+ALIGN	32
+
+$L$ctr_pattern:
+	DQ	0,0
+	DQ	1,0
+$L$inc_2blocks:
+	DQ	2,0
+	DQ	2,0
+
+section	.text code align=64
+
+global	gcm_init_vpclmulqdq_avx2
+
+ALIGN	32
+gcm_init_vpclmulqdq_avx2:
+
+$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1:
+_CET_ENDBR
+	sub	rsp,24
+$L$SEH_prologue_gcm_init_vpclmulqdq_avx2_2:
+	movdqa	XMMWORD[rsp],xmm6
+$L$SEH_prologue_gcm_init_vpclmulqdq_avx2_3:
+
+$L$SEH_endprologue_gcm_init_vpclmulqdq_avx2_4:
+
+
+
+	vpshufd	xmm3,XMMWORD[rdx],0x4e
+
+
+
+
+
+	vpshufd	xmm0,xmm3,0xd3
+	vpsrad	xmm0,xmm0,31
+	vpaddq	xmm3,xmm3,xmm3
+	vpand	xmm0,xmm0,XMMWORD[$L$gfpoly_and_internal_carrybit]
+	vpxor	xmm3,xmm3,xmm0
+
+	vbroadcasti128	ymm6,XMMWORD[$L$gfpoly]
+
+
+	vpclmulqdq	xmm0,xmm3,xmm3,0x00
+	vpclmulqdq	xmm1,xmm3,xmm3,0x01
+	vpclmulqdq	xmm2,xmm3,xmm3,0x10
+	vpxor	xmm1,xmm1,xmm2
+	vpclmulqdq	xmm2,xmm6,xmm0,0x01
+	vpshufd	xmm0,xmm0,0x4e
+	vpxor	xmm1,xmm1,xmm0
+	vpxor	xmm1,xmm1,xmm2
+	vpclmulqdq	xmm5,xmm3,xmm3,0x11
+	vpclmulqdq	xmm0,xmm6,xmm1,0x01
+	vpshufd	xmm1,xmm1,0x4e
+	vpxor	xmm5,xmm5,xmm1
+	vpxor	xmm5,xmm5,xmm0
+
+
+
+	vinserti128	ymm3,ymm5,xmm3,1
+	vinserti128	ymm5,ymm5,xmm5,1
+
+
+	vpclmulqdq	ymm0,ymm3,ymm5,0x00
+	vpclmulqdq	ymm1,ymm3,ymm5,0x01
+	vpclmulqdq	ymm2,ymm3,ymm5,0x10
+	vpxor	ymm1,ymm1,ymm2
+	vpclmulqdq	ymm2,ymm6,ymm0,0x01
+	vpshufd	ymm0,ymm0,0x4e
+	vpxor	ymm1,ymm1,ymm0
+	vpxor	ymm1,ymm1,ymm2
+	vpclmulqdq	ymm4,ymm3,ymm5,0x11
+	vpclmulqdq	ymm0,ymm6,ymm1,0x01
+	vpshufd	ymm1,ymm1,0x4e
+	vpxor	ymm4,ymm4,ymm1
+	vpxor	ymm4,ymm4,ymm0
+
+
+
+	vmovdqu	YMMWORD[96+rcx],ymm3
+	vmovdqu	YMMWORD[64+rcx],ymm4
+
+
+
+	vpunpcklqdq	ymm0,ymm4,ymm3
+	vpunpckhqdq	ymm1,ymm4,ymm3
+	vpxor	ymm0,ymm0,ymm1
+	vmovdqu	YMMWORD[(128+32)+rcx],ymm0
+
+
+	vpclmulqdq	ymm0,ymm4,ymm5,0x00
+	vpclmulqdq	ymm1,ymm4,ymm5,0x01
+	vpclmulqdq	ymm2,ymm4,ymm5,0x10
+	vpxor	ymm1,ymm1,ymm2
+	vpclmulqdq	ymm2,ymm6,ymm0,0x01
+	vpshufd	ymm0,ymm0,0x4e
+	vpxor	ymm1,ymm1,ymm0
+	vpxor	ymm1,ymm1,ymm2
+	vpclmulqdq	ymm3,ymm4,ymm5,0x11
+	vpclmulqdq	ymm0,ymm6,ymm1,0x01
+	vpshufd	ymm1,ymm1,0x4e
+	vpxor	ymm3,ymm3,ymm1
+	vpxor	ymm3,ymm3,ymm0
+
+	vpclmulqdq	ymm0,ymm3,ymm5,0x00
+	vpclmulqdq	ymm1,ymm3,ymm5,0x01
+	vpclmulqdq	ymm2,ymm3,ymm5,0x10
+	vpxor	ymm1,ymm1,ymm2
+	vpclmulqdq	ymm2,ymm6,ymm0,0x01
+	vpshufd	ymm0,ymm0,0x4e
+	vpxor	ymm1,ymm1,ymm0
+	vpxor	ymm1,ymm1,ymm2
+	vpclmulqdq	ymm4,ymm3,ymm5,0x11
+	vpclmulqdq	ymm0,ymm6,ymm1,0x01
+	vpshufd	ymm1,ymm1,0x4e
+	vpxor	ymm4,ymm4,ymm1
+	vpxor	ymm4,ymm4,ymm0
+
+	vmovdqu	YMMWORD[32+rcx],ymm3
+	vmovdqu	YMMWORD[rcx],ymm4
+
+
+
+	vpunpcklqdq	ymm0,ymm4,ymm3
+	vpunpckhqdq	ymm1,ymm4,ymm3
+	vpxor	ymm0,ymm0,ymm1
+	vmovdqu	YMMWORD[128+rcx],ymm0
+
+	vzeroupper
+	movdqa	xmm6,XMMWORD[rsp]
+	add	rsp,24
+	ret
+$L$SEH_end_gcm_init_vpclmulqdq_avx2_5:
+
+
+global	gcm_gmult_vpclmulqdq_avx2
+
+ALIGN	32
+gcm_gmult_vpclmulqdq_avx2:
+
+$L$SEH_begin_gcm_gmult_vpclmulqdq_avx2_1:
+_CET_ENDBR
+	sub	rsp,24
+$L$SEH_prologue_gcm_gmult_vpclmulqdq_avx2_2:
+	movdqa	XMMWORD[rsp],xmm6
+$L$SEH_prologue_gcm_gmult_vpclmulqdq_avx2_3:
+
+$L$SEH_endprologue_gcm_gmult_vpclmulqdq_avx2_4:
+
+	vmovdqu	xmm0,XMMWORD[rcx]
+	vmovdqu	xmm1,XMMWORD[$L$bswap_mask]
+	vmovdqu	xmm2,XMMWORD[((128-16))+rdx]
+	vmovdqu	xmm3,XMMWORD[$L$gfpoly]
+	vpshufb	xmm0,xmm0,xmm1
+
+	vpclmulqdq	xmm4,xmm0,xmm2,0x00
+	vpclmulqdq	xmm5,xmm0,xmm2,0x01
+	vpclmulqdq	xmm6,xmm0,xmm2,0x10
+	vpxor	xmm5,xmm5,xmm6
+	vpclmulqdq	xmm6,xmm3,xmm4,0x01
+	vpshufd	xmm4,xmm4,0x4e
+	vpxor	xmm5,xmm5,xmm4
+	vpxor	xmm5,xmm5,xmm6
+	vpclmulqdq	xmm0,xmm0,xmm2,0x11
+	vpclmulqdq	xmm4,xmm3,xmm5,0x01
+	vpshufd	xmm5,xmm5,0x4e
+	vpxor	xmm0,xmm0,xmm5
+	vpxor	xmm0,xmm0,xmm4
+
+
+	vpshufb	xmm0,xmm0,xmm1
+	vmovdqu	XMMWORD[rcx],xmm0
+	movdqa	xmm6,XMMWORD[rsp]
+	add	rsp,24
+	ret
+$L$SEH_end_gcm_gmult_vpclmulqdq_avx2_5:
+
+
+global	gcm_ghash_vpclmulqdq_avx2
+
+ALIGN	32
+gcm_ghash_vpclmulqdq_avx2:
+
+$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1:
+_CET_ENDBR
+	sub	rsp,72
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_2:
+	movdqa	XMMWORD[rsp],xmm6
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_3:
+	movdqa	XMMWORD[16+rsp],xmm7
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_4:
+	movdqa	XMMWORD[32+rsp],xmm8
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_5:
+	movdqa	XMMWORD[48+rsp],xmm9
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_6:
+
+$L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx2_7:
+
+	vbroadcasti128	ymm6,XMMWORD[$L$bswap_mask]
+	vmovdqu	xmm5,XMMWORD[rcx]
+	vpshufb	xmm5,xmm5,xmm6
+	vbroadcasti128	ymm7,XMMWORD[$L$gfpoly]
+
+
+	cmp	r9,32
+	jb	NEAR $L$ghash_lastblock
+
+	cmp	r9,127
+	jbe	NEAR $L$ghash_loop_1x
+
+
+	vmovdqu	ymm8,YMMWORD[128+rdx]
+	vmovdqu	ymm9,YMMWORD[((128+32))+rdx]
+$L$ghash_loop_4x:
+
+	vmovdqu	ymm1,YMMWORD[r8]
+	vpshufb	ymm1,ymm1,ymm6
+	vmovdqu	ymm2,YMMWORD[rdx]
+	vpxor	ymm1,ymm1,ymm5
+	vpclmulqdq	ymm3,ymm1,ymm2,0x00
+	vpclmulqdq	ymm5,ymm1,ymm2,0x11
+	vpunpckhqdq	ymm0,ymm1,ymm1
+	vpxor	ymm0,ymm0,ymm1
+	vpclmulqdq	ymm4,ymm0,ymm8,0x00
+
+	vmovdqu	ymm1,YMMWORD[32+r8]
+	vpshufb	ymm1,ymm1,ymm6
+	vmovdqu	ymm2,YMMWORD[32+rdx]
+	vpclmulqdq	ymm0,ymm1,ymm2,0x00
+	vpxor	ymm3,ymm3,ymm0
+	vpclmulqdq	ymm0,ymm1,ymm2,0x11
+	vpxor	ymm5,ymm5,ymm0
+	vpunpckhqdq	ymm0,ymm1,ymm1
+	vpxor	ymm0,ymm0,ymm1
+	vpclmulqdq	ymm0,ymm0,ymm8,0x10
+	vpxor	ymm4,ymm4,ymm0
+
+	vmovdqu	ymm1,YMMWORD[64+r8]
+	vpshufb	ymm1,ymm1,ymm6
+	vmovdqu	ymm2,YMMWORD[64+rdx]
+	vpclmulqdq	ymm0,ymm1,ymm2,0x00
+	vpxor	ymm3,ymm3,ymm0
+	vpclmulqdq	ymm0,ymm1,ymm2,0x11
+	vpxor	ymm5,ymm5,ymm0
+	vpunpckhqdq	ymm0,ymm1,ymm1
+	vpxor	ymm0,ymm0,ymm1
+	vpclmulqdq	ymm0,ymm0,ymm9,0x00
+	vpxor	ymm4,ymm4,ymm0
+
+
+	vmovdqu	ymm1,YMMWORD[96+r8]
+	vpshufb	ymm1,ymm1,ymm6
+	vmovdqu	ymm2,YMMWORD[96+rdx]
+	vpclmulqdq	ymm0,ymm1,ymm2,0x00
+	vpxor	ymm3,ymm3,ymm0
+	vpclmulqdq	ymm0,ymm1,ymm2,0x11
+	vpxor	ymm5,ymm5,ymm0
+	vpunpckhqdq	ymm0,ymm1,ymm1
+	vpxor	ymm0,ymm0,ymm1
+	vpclmulqdq	ymm0,ymm0,ymm9,0x10
+	vpxor	ymm4,ymm4,ymm0
+
+	vpxor	ymm4,ymm4,ymm3
+	vpxor	ymm4,ymm4,ymm5
+
+
+	vbroadcasti128	ymm2,XMMWORD[$L$gfpoly]
+	vpclmulqdq	ymm0,ymm2,ymm3,0x01
+	vpshufd	ymm3,ymm3,0x4e
+	vpxor	ymm4,ymm4,ymm3
+	vpxor	ymm4,ymm4,ymm0
+
+	vpclmulqdq	ymm0,ymm2,ymm4,0x01
+	vpshufd	ymm4,ymm4,0x4e
+	vpxor	ymm5,ymm5,ymm4
+	vpxor	ymm5,ymm5,ymm0
+	vextracti128	xmm0,ymm5,1
+	vpxor	xmm5,xmm5,xmm0
+
+	sub	r8,-128
+	add	r9,-128
+	cmp	r9,127
+	ja	NEAR $L$ghash_loop_4x
+
+
+	cmp	r9,32
+	jb	NEAR $L$ghash_loop_1x_done
+$L$ghash_loop_1x:
+	vmovdqu	ymm0,YMMWORD[r8]
+	vpshufb	ymm0,ymm0,ymm6
+	vpxor	ymm5,ymm5,ymm0
+	vmovdqu	ymm0,YMMWORD[((128-32))+rdx]
+	vpclmulqdq	ymm1,ymm5,ymm0,0x00
+	vpclmulqdq	ymm2,ymm5,ymm0,0x01
+	vpclmulqdq	ymm3,ymm5,ymm0,0x10
+	vpxor	ymm2,ymm2,ymm3
+	vpclmulqdq	ymm3,ymm7,ymm1,0x01
+	vpshufd	ymm1,ymm1,0x4e
+	vpxor	ymm2,ymm2,ymm1
+	vpxor	ymm2,ymm2,ymm3
+	vpclmulqdq	ymm5,ymm5,ymm0,0x11
+	vpclmulqdq	ymm1,ymm7,ymm2,0x01
+	vpshufd	ymm2,ymm2,0x4e
+	vpxor	ymm5,ymm5,ymm2
+	vpxor	ymm5,ymm5,ymm1
+
+	vextracti128	xmm0,ymm5,1
+	vpxor	xmm5,xmm5,xmm0
+	add	r8,32
+	sub	r9,32
+	cmp	r9,32
+	jae	NEAR $L$ghash_loop_1x
+$L$ghash_loop_1x_done:
+
+
+	vzeroupper
+
+
+$L$ghash_lastblock:
+	test	r9,r9
+	jz	NEAR $L$ghash_done
+	vmovdqu	xmm0,XMMWORD[r8]
+	vpshufb	xmm0,xmm0,xmm6
+	vpxor	xmm5,xmm5,xmm0
+	vmovdqu	xmm0,XMMWORD[((128-16))+rdx]
+	vpclmulqdq	xmm1,xmm5,xmm0,0x00
+	vpclmulqdq	xmm2,xmm5,xmm0,0x01
+	vpclmulqdq	xmm3,xmm5,xmm0,0x10
+	vpxor	xmm2,xmm2,xmm3
+	vpclmulqdq	xmm3,xmm7,xmm1,0x01
+	vpshufd	xmm1,xmm1,0x4e
+	vpxor	xmm2,xmm2,xmm1
+	vpxor	xmm2,xmm2,xmm3
+	vpclmulqdq	xmm5,xmm5,xmm0,0x11
+	vpclmulqdq	xmm1,xmm7,xmm2,0x01
+	vpshufd	xmm2,xmm2,0x4e
+	vpxor	xmm5,xmm5,xmm2
+	vpxor	xmm5,xmm5,xmm1
+
+
+$L$ghash_done:
+
+	vpshufb	xmm5,xmm5,xmm6
+	vmovdqu	XMMWORD[rcx],xmm5
+	movdqa	xmm6,XMMWORD[rsp]
+	movdqa	xmm7,XMMWORD[16+rsp]
+	movdqa	xmm8,XMMWORD[32+rsp]
+	movdqa	xmm9,XMMWORD[48+rsp]
+	add	rsp,72
+	ret
+$L$SEH_end_gcm_ghash_vpclmulqdq_avx2_8:
+
+
+global	aes_gcm_enc_update_vaes_avx2
+
+ALIGN	32
+aes_gcm_enc_update_vaes_avx2:
+
+$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1:
+_CET_ENDBR
+	push	rsi
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_2:
+	push	rdi
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_3:
+	push	r12
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_4:
+
+	mov	rsi,QWORD[64+rsp]
+	mov	rdi,QWORD[72+rsp]
+	mov	r12,QWORD[80+rsp]
+	sub	rsp,160
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_5:
+	movdqa	XMMWORD[rsp],xmm6
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_6:
+	movdqa	XMMWORD[16+rsp],xmm7
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_7:
+	movdqa	XMMWORD[32+rsp],xmm8
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_8:
+	movdqa	XMMWORD[48+rsp],xmm9
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_9:
+	movdqa	XMMWORD[64+rsp],xmm10
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_10:
+	movdqa	XMMWORD[80+rsp],xmm11
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_11:
+	movdqa	XMMWORD[96+rsp],xmm12
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_12:
+	movdqa	XMMWORD[112+rsp],xmm13
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_13:
+	movdqa	XMMWORD[128+rsp],xmm14
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_14:
+	movdqa	XMMWORD[144+rsp],xmm15
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_15:
+
+$L$SEH_endprologue_aes_gcm_enc_update_vaes_avx2_16:
+%ifdef BORINGSSL_DISPATCH_TEST
+EXTERN	BORINGSSL_function_hit
+	mov	BYTE[((BORINGSSL_function_hit+8))],1
+%endif
+	vbroadcasti128	ymm0,XMMWORD[$L$bswap_mask]
+
+
+
+	vmovdqu	xmm1,XMMWORD[r12]
+	vpshufb	xmm1,xmm1,xmm0
+	vbroadcasti128	ymm11,XMMWORD[rsi]
+	vpshufb	ymm11,ymm11,ymm0
+
+
+
+	mov	r10d,DWORD[240+r9]
+	lea	r10d,[((-20))+r10*4]
+
+
+
+
+	lea	r11,[96+r10*4+r9]
+	vbroadcasti128	ymm9,XMMWORD[r9]
+	vbroadcasti128	ymm10,XMMWORD[r11]
+
+
+	vpaddd	ymm11,ymm11,YMMWORD[$L$ctr_pattern]
+
+
+
+	cmp	r8,127
+	jbe	NEAR $L$crypt_loop_4x_done__func1
+
+	vmovdqu	ymm7,YMMWORD[128+rdi]
+	vmovdqu	ymm8,YMMWORD[((128+32))+rdi]
+
+
+
+	vmovdqu	ymm2,YMMWORD[$L$inc_2blocks]
+	vpshufb	ymm12,ymm11,ymm0
+	vpaddd	ymm11,ymm11,ymm2
+	vpshufb	ymm13,ymm11,ymm0
+	vpaddd	ymm11,ymm11,ymm2
+	vpshufb	ymm14,ymm11,ymm0
+	vpaddd	ymm11,ymm11,ymm2
+	vpshufb	ymm15,ymm11,ymm0
+	vpaddd	ymm11,ymm11,ymm2
+
+
+	vpxor	ymm12,ymm12,ymm9
+	vpxor	ymm13,ymm13,ymm9
+	vpxor	ymm14,ymm14,ymm9
+	vpxor	ymm15,ymm15,ymm9
+
+	lea	rax,[16+r9]
+$L$vaesenc_loop_first_4_vecs__func1:
+	vbroadcasti128	ymm2,XMMWORD[rax]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+	add	rax,16
+	cmp	r11,rax
+	jne	NEAR $L$vaesenc_loop_first_4_vecs__func1
+	vpxor	ymm2,ymm10,YMMWORD[rcx]
+	vpxor	ymm3,ymm10,YMMWORD[32+rcx]
+	vpxor	ymm5,ymm10,YMMWORD[64+rcx]
+	vpxor	ymm6,ymm10,YMMWORD[96+rcx]
+	vaesenclast	ymm12,ymm12,ymm2
+	vaesenclast	ymm13,ymm13,ymm3
+	vaesenclast	ymm14,ymm14,ymm5
+	vaesenclast	ymm15,ymm15,ymm6
+	vmovdqu	YMMWORD[rdx],ymm12
+	vmovdqu	YMMWORD[32+rdx],ymm13
+	vmovdqu	YMMWORD[64+rdx],ymm14
+	vmovdqu	YMMWORD[96+rdx],ymm15
+
+	sub	rcx,-128
+	add	r8,-128
+	cmp	r8,127
+	jbe	NEAR $L$ghash_last_ciphertext_4x__func1
+ALIGN	16
+$L$crypt_loop_4x__func1:
+
+
+
+
+	vmovdqu	ymm2,YMMWORD[$L$inc_2blocks]
+	vpshufb	ymm12,ymm11,ymm0
+	vpaddd	ymm11,ymm11,ymm2
+	vpshufb	ymm13,ymm11,ymm0
+	vpaddd	ymm11,ymm11,ymm2
+	vpshufb	ymm14,ymm11,ymm0
+	vpaddd	ymm11,ymm11,ymm2
+	vpshufb	ymm15,ymm11,ymm0
+	vpaddd	ymm11,ymm11,ymm2
+
+
+	vpxor	ymm12,ymm12,ymm9
+	vpxor	ymm13,ymm13,ymm9
+	vpxor	ymm14,ymm14,ymm9
+	vpxor	ymm15,ymm15,ymm9
+
+	cmp	r10d,24
+	jl	NEAR $L$aes128__func1
+	je	NEAR $L$aes192__func1
+
+	vbroadcasti128	ymm2,XMMWORD[((-208))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+	vbroadcasti128	ymm2,XMMWORD[((-192))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+$L$aes192__func1:
+	vbroadcasti128	ymm2,XMMWORD[((-176))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+	vbroadcasti128	ymm2,XMMWORD[((-160))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+$L$aes128__func1:
+
+	vmovdqu	ymm3,YMMWORD[rdx]
+	vpshufb	ymm3,ymm3,ymm0
+	vmovdqu	ymm4,YMMWORD[rdi]
+	vpxor	ymm3,ymm3,ymm1
+	vpclmulqdq	ymm5,ymm3,ymm4,0x00
+	vpclmulqdq	ymm1,ymm3,ymm4,0x11
+	vpunpckhqdq	ymm2,ymm3,ymm3
+	vpxor	ymm2,ymm2,ymm3
+	vpclmulqdq	ymm6,ymm2,ymm7,0x00
+
+	vbroadcasti128	ymm2,XMMWORD[((-144))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+
+	vbroadcasti128	ymm2,XMMWORD[((-128))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+
+	vmovdqu	ymm3,YMMWORD[32+rdx]
+	vpshufb	ymm3,ymm3,ymm0
+	vmovdqu	ymm4,YMMWORD[32+rdi]
+	vpclmulqdq	ymm2,ymm3,ymm4,0x00
+	vpxor	ymm5,ymm5,ymm2
+	vpclmulqdq	ymm2,ymm3,ymm4,0x11
+	vpxor	ymm1,ymm1,ymm2
+	vpunpckhqdq	ymm2,ymm3,ymm3
+	vpxor	ymm2,ymm2,ymm3
+	vpclmulqdq	ymm2,ymm2,ymm7,0x10
+	vpxor	ymm6,ymm6,ymm2
+
+	vbroadcasti128	ymm2,XMMWORD[((-112))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+
+	vmovdqu	ymm3,YMMWORD[64+rdx]
+	vpshufb	ymm3,ymm3,ymm0
+	vmovdqu	ymm4,YMMWORD[64+rdi]
+
+	vbroadcasti128	ymm2,XMMWORD[((-96))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+	vpclmulqdq	ymm2,ymm3,ymm4,0x00
+	vpxor	ymm5,ymm5,ymm2
+	vpclmulqdq	ymm2,ymm3,ymm4,0x11
+	vpxor	ymm1,ymm1,ymm2
+
+	vbroadcasti128	ymm2,XMMWORD[((-80))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+	vpunpckhqdq	ymm2,ymm3,ymm3
+	vpxor	ymm2,ymm2,ymm3
+	vpclmulqdq	ymm2,ymm2,ymm8,0x00
+	vpxor	ymm6,ymm6,ymm2
+
+
+	vmovdqu	ymm3,YMMWORD[96+rdx]
+	vpshufb	ymm3,ymm3,ymm0
+
+	vbroadcasti128	ymm2,XMMWORD[((-64))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+	vmovdqu	ymm4,YMMWORD[96+rdi]
+	vpclmulqdq	ymm2,ymm3,ymm4,0x00
+	vpxor	ymm5,ymm5,ymm2
+	vpclmulqdq	ymm2,ymm3,ymm4,0x11
+	vpxor	ymm1,ymm1,ymm2
+	vpunpckhqdq	ymm2,ymm3,ymm3
+	vpxor	ymm2,ymm2,ymm3
+	vpclmulqdq	ymm2,ymm2,ymm8,0x10
+	vpxor	ymm6,ymm6,ymm2
+
+	vbroadcasti128	ymm2,XMMWORD[((-48))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+
+	vpxor	ymm6,ymm6,ymm5
+	vpxor	ymm6,ymm6,ymm1
+
+
+	vbroadcasti128	ymm4,XMMWORD[$L$gfpoly]
+	vpclmulqdq	ymm2,ymm4,ymm5,0x01
+	vpshufd	ymm5,ymm5,0x4e
+	vpxor	ymm6,ymm6,ymm5
+	vpxor	ymm6,ymm6,ymm2
+
+	vbroadcasti128	ymm2,XMMWORD[((-32))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+
+	vpclmulqdq	ymm2,ymm4,ymm6,0x01
+	vpshufd	ymm6,ymm6,0x4e
+	vpxor	ymm1,ymm1,ymm6
+	vpxor	ymm1,ymm1,ymm2
+
+	vbroadcasti128	ymm2,XMMWORD[((-16))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+	vextracti128	xmm2,ymm1,1
+	vpxor	xmm1,xmm1,xmm2
+
+
+	sub	rdx,-128
+	vpxor	ymm2,ymm10,YMMWORD[rcx]
+	vpxor	ymm3,ymm10,YMMWORD[32+rcx]
+	vpxor	ymm5,ymm10,YMMWORD[64+rcx]
+	vpxor	ymm6,ymm10,YMMWORD[96+rcx]
+	vaesenclast	ymm12,ymm12,ymm2
+	vaesenclast	ymm13,ymm13,ymm3
+	vaesenclast	ymm14,ymm14,ymm5
+	vaesenclast	ymm15,ymm15,ymm6
+	vmovdqu	YMMWORD[rdx],ymm12
+	vmovdqu	YMMWORD[32+rdx],ymm13
+	vmovdqu	YMMWORD[64+rdx],ymm14
+	vmovdqu	YMMWORD[96+rdx],ymm15
+
+	sub	rcx,-128
+
+	add	r8,-128
+	cmp	r8,127
+	ja	NEAR $L$crypt_loop_4x__func1
+$L$ghash_last_ciphertext_4x__func1:
+
+	vmovdqu	ymm3,YMMWORD[rdx]
+	vpshufb	ymm3,ymm3,ymm0
+	vmovdqu	ymm4,YMMWORD[rdi]
+	vpxor	ymm3,ymm3,ymm1
+	vpclmulqdq	ymm5,ymm3,ymm4,0x00
+	vpclmulqdq	ymm1,ymm3,ymm4,0x11
+	vpunpckhqdq	ymm2,ymm3,ymm3
+	vpxor	ymm2,ymm2,ymm3
+	vpclmulqdq	ymm6,ymm2,ymm7,0x00
+
+	vmovdqu	ymm3,YMMWORD[32+rdx]
+	vpshufb	ymm3,ymm3,ymm0
+	vmovdqu	ymm4,YMMWORD[32+rdi]
+	vpclmulqdq	ymm2,ymm3,ymm4,0x00
+	vpxor	ymm5,ymm5,ymm2
+	vpclmulqdq	ymm2,ymm3,ymm4,0x11
+	vpxor	ymm1,ymm1,ymm2
+	vpunpckhqdq	ymm2,ymm3,ymm3
+	vpxor	ymm2,ymm2,ymm3
+	vpclmulqdq	ymm2,ymm2,ymm7,0x10
+	vpxor	ymm6,ymm6,ymm2
+
+	vmovdqu	ymm3,YMMWORD[64+rdx]
+	vpshufb	ymm3,ymm3,ymm0
+	vmovdqu	ymm4,YMMWORD[64+rdi]
+	vpclmulqdq	ymm2,ymm3,ymm4,0x00
+	vpxor	ymm5,ymm5,ymm2
+	vpclmulqdq	ymm2,ymm3,ymm4,0x11
+	vpxor	ymm1,ymm1,ymm2
+	vpunpckhqdq	ymm2,ymm3,ymm3
+	vpxor	ymm2,ymm2,ymm3
+	vpclmulqdq	ymm2,ymm2,ymm8,0x00
+	vpxor	ymm6,ymm6,ymm2
+
+
+	vmovdqu	ymm3,YMMWORD[96+rdx]
+	vpshufb	ymm3,ymm3,ymm0
+	vmovdqu	ymm4,YMMWORD[96+rdi]
+	vpclmulqdq	ymm2,ymm3,ymm4,0x00
+	vpxor	ymm5,ymm5,ymm2
+	vpclmulqdq	ymm2,ymm3,ymm4,0x11
+	vpxor	ymm1,ymm1,ymm2
+	vpunpckhqdq	ymm2,ymm3,ymm3
+	vpxor	ymm2,ymm2,ymm3
+	vpclmulqdq	ymm2,ymm2,ymm8,0x10
+	vpxor	ymm6,ymm6,ymm2
+
+	vpxor	ymm6,ymm6,ymm5
+	vpxor	ymm6,ymm6,ymm1
+
+
+	vbroadcasti128	ymm4,XMMWORD[$L$gfpoly]
+	vpclmulqdq	ymm2,ymm4,ymm5,0x01
+	vpshufd	ymm5,ymm5,0x4e
+	vpxor	ymm6,ymm6,ymm5
+	vpxor	ymm6,ymm6,ymm2
+
+	vpclmulqdq	ymm2,ymm4,ymm6,0x01
+	vpshufd	ymm6,ymm6,0x4e
+	vpxor	ymm1,ymm1,ymm6
+	vpxor	ymm1,ymm1,ymm2
+	vextracti128	xmm2,ymm1,1
+	vpxor	xmm1,xmm1,xmm2
+
+	sub	rdx,-128
+$L$crypt_loop_4x_done__func1:
+
+	test	r8,r8
+	jz	NEAR $L$done__func1
+
+
+
+
+
+	lea	rsi,[128+rdi]
+	sub	rsi,r8
+
+
+	vpxor	xmm5,xmm5,xmm5
+	vpxor	xmm6,xmm6,xmm6
+	vpxor	xmm7,xmm7,xmm7
+
+	cmp	r8,64
+	jb	NEAR $L$lessthan64bytes__func1
+
+
+	vpshufb	ymm12,ymm11,ymm0
+	vpaddd	ymm11,ymm11,YMMWORD[$L$inc_2blocks]
+	vpshufb	ymm13,ymm11,ymm0
+	vpaddd	ymm11,ymm11,YMMWORD[$L$inc_2blocks]
+	vpxor	ymm12,ymm12,ymm9
+	vpxor	ymm13,ymm13,ymm9
+	lea	rax,[16+r9]
+$L$vaesenc_loop_tail_1__func1:
+	vbroadcasti128	ymm2,XMMWORD[rax]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	add	rax,16
+	cmp	r11,rax
+	jne	NEAR $L$vaesenc_loop_tail_1__func1
+	vaesenclast	ymm12,ymm12,ymm10
+	vaesenclast	ymm13,ymm13,ymm10
+
+
+	vmovdqu	ymm2,YMMWORD[rcx]
+	vmovdqu	ymm3,YMMWORD[32+rcx]
+	vpxor	ymm12,ymm12,ymm2
+	vpxor	ymm13,ymm13,ymm3
+	vmovdqu	YMMWORD[rdx],ymm12
+	vmovdqu	YMMWORD[32+rdx],ymm13
+
+
+	vpshufb	ymm12,ymm12,ymm0
+	vpshufb	ymm13,ymm13,ymm0
+	vpxor	ymm12,ymm12,ymm1
+	vmovdqu	ymm2,YMMWORD[rsi]
+	vmovdqu	ymm3,YMMWORD[32+rsi]
+	vpclmulqdq	ymm5,ymm12,ymm2,0x00
+	vpclmulqdq	ymm6,ymm12,ymm2,0x01
+	vpclmulqdq	ymm4,ymm12,ymm2,0x10
+	vpxor	ymm6,ymm6,ymm4
+	vpclmulqdq	ymm7,ymm12,ymm2,0x11
+	vpclmulqdq	ymm4,ymm13,ymm3,0x00
+	vpxor	ymm5,ymm5,ymm4
+	vpclmulqdq	ymm4,ymm13,ymm3,0x01
+	vpxor	ymm6,ymm6,ymm4
+	vpclmulqdq	ymm4,ymm13,ymm3,0x10
+	vpxor	ymm6,ymm6,ymm4
+	vpclmulqdq	ymm4,ymm13,ymm3,0x11
+	vpxor	ymm7,ymm7,ymm4
+
+	add	rsi,64
+	add	rcx,64
+	add	rdx,64
+	sub	r8,64
+	jz	NEAR $L$reduce__func1
+
+	vpxor	xmm1,xmm1,xmm1
+
+
+$L$lessthan64bytes__func1:
+	vpshufb	ymm12,ymm11,ymm0
+	vpaddd	ymm11,ymm11,YMMWORD[$L$inc_2blocks]
+	vpshufb	ymm13,ymm11,ymm0
+	vpxor	ymm12,ymm12,ymm9
+	vpxor	ymm13,ymm13,ymm9
+	lea	rax,[16+r9]
+$L$vaesenc_loop_tail_2__func1:
+	vbroadcasti128	ymm2,XMMWORD[rax]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	add	rax,16
+	cmp	r11,rax
+	jne	NEAR $L$vaesenc_loop_tail_2__func1
+	vaesenclast	ymm12,ymm12,ymm10
+	vaesenclast	ymm13,ymm13,ymm10
+
+
+
+
+	cmp	r8,32
+	jb	NEAR $L$xor_one_block__func1
+	je	NEAR $L$xor_two_blocks__func1
+
+$L$xor_three_blocks__func1:
+	vmovdqu	ymm2,YMMWORD[rcx]
+	vmovdqu	xmm3,XMMWORD[32+rcx]
+	vpxor	ymm12,ymm12,ymm2
+	vpxor	xmm13,xmm13,xmm3
+	vmovdqu	YMMWORD[rdx],ymm12
+	vmovdqu	XMMWORD[32+rdx],xmm13
+
+	vpshufb	ymm12,ymm12,ymm0
+	vpshufb	xmm13,xmm13,xmm0
+	vpxor	ymm12,ymm12,ymm1
+	vmovdqu	ymm2,YMMWORD[rsi]
+	vmovdqu	xmm3,XMMWORD[32+rsi]
+	vpclmulqdq	xmm4,xmm13,xmm3,0x00
+	vpxor	ymm5,ymm5,ymm4
+	vpclmulqdq	xmm4,xmm13,xmm3,0x01
+	vpxor	ymm6,ymm6,ymm4
+	vpclmulqdq	xmm4,xmm13,xmm3,0x10
+	vpxor	ymm6,ymm6,ymm4
+	vpclmulqdq	xmm4,xmm13,xmm3,0x11
+	vpxor	ymm7,ymm7,ymm4
+	jmp	NEAR $L$ghash_mul_one_vec_unreduced__func1
+
+$L$xor_two_blocks__func1:
+	vmovdqu	ymm2,YMMWORD[rcx]
+	vpxor	ymm12,ymm12,ymm2
+	vmovdqu	YMMWORD[rdx],ymm12
+	vpshufb	ymm12,ymm12,ymm0
+	vpxor	ymm12,ymm12,ymm1
+	vmovdqu	ymm2,YMMWORD[rsi]
+	jmp	NEAR $L$ghash_mul_one_vec_unreduced__func1
+
+$L$xor_one_block__func1:
+	vmovdqu	xmm2,XMMWORD[rcx]
+	vpxor	xmm12,xmm12,xmm2
+	vmovdqu	XMMWORD[rdx],xmm12
+	vpshufb	xmm12,xmm12,xmm0
+	vpxor	xmm12,xmm12,xmm1
+	vmovdqu	xmm2,XMMWORD[rsi]
+
+$L$ghash_mul_one_vec_unreduced__func1:
+	vpclmulqdq	ymm4,ymm12,ymm2,0x00
+	vpxor	ymm5,ymm5,ymm4
+	vpclmulqdq	ymm4,ymm12,ymm2,0x01
+	vpxor	ymm6,ymm6,ymm4
+	vpclmulqdq	ymm4,ymm12,ymm2,0x10
+	vpxor	ymm6,ymm6,ymm4
+	vpclmulqdq	ymm4,ymm12,ymm2,0x11
+	vpxor	ymm7,ymm7,ymm4
+
+$L$reduce__func1:
+
+	vbroadcasti128	ymm2,XMMWORD[$L$gfpoly]
+	vpclmulqdq	ymm3,ymm2,ymm5,0x01
+	vpshufd	ymm5,ymm5,0x4e
+	vpxor	ymm6,ymm6,ymm5
+	vpxor	ymm6,ymm6,ymm3
+	vpclmulqdq	ymm3,ymm2,ymm6,0x01
+	vpshufd	ymm6,ymm6,0x4e
+	vpxor	ymm7,ymm7,ymm6
+	vpxor	ymm7,ymm7,ymm3
+	vextracti128	xmm1,ymm7,1
+	vpxor	xmm1,xmm1,xmm7
+
+$L$done__func1:
+
+	vpshufb	xmm1,xmm1,xmm0
+	vmovdqu	XMMWORD[r12],xmm1
+
+	vzeroupper
+	movdqa	xmm6,XMMWORD[rsp]
+	movdqa	xmm7,XMMWORD[16+rsp]
+	movdqa	xmm8,XMMWORD[32+rsp]
+	movdqa	xmm9,XMMWORD[48+rsp]
+	movdqa	xmm10,XMMWORD[64+rsp]
+	movdqa	xmm11,XMMWORD[80+rsp]
+	movdqa	xmm12,XMMWORD[96+rsp]
+	movdqa	xmm13,XMMWORD[112+rsp]
+	movdqa	xmm14,XMMWORD[128+rsp]
+	movdqa	xmm15,XMMWORD[144+rsp]
+	add	rsp,160
+	pop	r12
+	pop	rdi
+	pop	rsi
+	ret
+$L$SEH_end_aes_gcm_enc_update_vaes_avx2_17:
+
+
+global	aes_gcm_dec_update_vaes_avx2
+
+ALIGN	32
+aes_gcm_dec_update_vaes_avx2:
+
+$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1:
+_CET_ENDBR
+	push	rsi
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_2:
+	push	rdi
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_3:
+	push	r12
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_4:
+
+	mov	rsi,QWORD[64+rsp]
+	mov	rdi,QWORD[72+rsp]
+	mov	r12,QWORD[80+rsp]
+	sub	rsp,160
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_5:
+	movdqa	XMMWORD[rsp],xmm6
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_6:
+	movdqa	XMMWORD[16+rsp],xmm7
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_7:
+	movdqa	XMMWORD[32+rsp],xmm8
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_8:
+	movdqa	XMMWORD[48+rsp],xmm9
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_9:
+	movdqa	XMMWORD[64+rsp],xmm10
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_10:
+	movdqa	XMMWORD[80+rsp],xmm11
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_11:
+	movdqa	XMMWORD[96+rsp],xmm12
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_12:
+	movdqa	XMMWORD[112+rsp],xmm13
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_13:
+	movdqa	XMMWORD[128+rsp],xmm14
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_14:
+	movdqa	XMMWORD[144+rsp],xmm15
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_15:
+
+$L$SEH_endprologue_aes_gcm_dec_update_vaes_avx2_16:
+	vbroadcasti128	ymm0,XMMWORD[$L$bswap_mask]
+
+
+
+	vmovdqu	xmm1,XMMWORD[r12]
+	vpshufb	xmm1,xmm1,xmm0
+	vbroadcasti128	ymm11,XMMWORD[rsi]
+	vpshufb	ymm11,ymm11,ymm0
+
+
+
+	mov	r10d,DWORD[240+r9]
+	lea	r10d,[((-20))+r10*4]
+
+
+
+
+	lea	r11,[96+r10*4+r9]
+	vbroadcasti128	ymm9,XMMWORD[r9]
+	vbroadcasti128	ymm10,XMMWORD[r11]
+
+
+	vpaddd	ymm11,ymm11,YMMWORD[$L$ctr_pattern]
+
+
+
+	cmp	r8,127
+	jbe	NEAR $L$crypt_loop_4x_done__func2
+
+	vmovdqu	ymm7,YMMWORD[128+rdi]
+	vmovdqu	ymm8,YMMWORD[((128+32))+rdi]
+ALIGN	16
+$L$crypt_loop_4x__func2:
+
+
+
+
+	vmovdqu	ymm2,YMMWORD[$L$inc_2blocks]
+	vpshufb	ymm12,ymm11,ymm0
+	vpaddd	ymm11,ymm11,ymm2
+	vpshufb	ymm13,ymm11,ymm0
+	vpaddd	ymm11,ymm11,ymm2
+	vpshufb	ymm14,ymm11,ymm0
+	vpaddd	ymm11,ymm11,ymm2
+	vpshufb	ymm15,ymm11,ymm0
+	vpaddd	ymm11,ymm11,ymm2
+
+
+	vpxor	ymm12,ymm12,ymm9
+	vpxor	ymm13,ymm13,ymm9
+	vpxor	ymm14,ymm14,ymm9
+	vpxor	ymm15,ymm15,ymm9
+
+	cmp	r10d,24
+	jl	NEAR $L$aes128__func2
+	je	NEAR $L$aes192__func2
+
+	vbroadcasti128	ymm2,XMMWORD[((-208))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+	vbroadcasti128	ymm2,XMMWORD[((-192))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+$L$aes192__func2:
+	vbroadcasti128	ymm2,XMMWORD[((-176))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+	vbroadcasti128	ymm2,XMMWORD[((-160))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+$L$aes128__func2:
+
+	vmovdqu	ymm3,YMMWORD[rcx]
+	vpshufb	ymm3,ymm3,ymm0
+	vmovdqu	ymm4,YMMWORD[rdi]
+	vpxor	ymm3,ymm3,ymm1
+	vpclmulqdq	ymm5,ymm3,ymm4,0x00
+	vpclmulqdq	ymm1,ymm3,ymm4,0x11
+	vpunpckhqdq	ymm2,ymm3,ymm3
+	vpxor	ymm2,ymm2,ymm3
+	vpclmulqdq	ymm6,ymm2,ymm7,0x00
+
+	vbroadcasti128	ymm2,XMMWORD[((-144))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+
+	vbroadcasti128	ymm2,XMMWORD[((-128))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+
+	vmovdqu	ymm3,YMMWORD[32+rcx]
+	vpshufb	ymm3,ymm3,ymm0
+	vmovdqu	ymm4,YMMWORD[32+rdi]
+	vpclmulqdq	ymm2,ymm3,ymm4,0x00
+	vpxor	ymm5,ymm5,ymm2
+	vpclmulqdq	ymm2,ymm3,ymm4,0x11
+	vpxor	ymm1,ymm1,ymm2
+	vpunpckhqdq	ymm2,ymm3,ymm3
+	vpxor	ymm2,ymm2,ymm3
+	vpclmulqdq	ymm2,ymm2,ymm7,0x10
+	vpxor	ymm6,ymm6,ymm2
+
+	vbroadcasti128	ymm2,XMMWORD[((-112))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+
+	vmovdqu	ymm3,YMMWORD[64+rcx]
+	vpshufb	ymm3,ymm3,ymm0
+	vmovdqu	ymm4,YMMWORD[64+rdi]
+
+	vbroadcasti128	ymm2,XMMWORD[((-96))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+	vpclmulqdq	ymm2,ymm3,ymm4,0x00
+	vpxor	ymm5,ymm5,ymm2
+	vpclmulqdq	ymm2,ymm3,ymm4,0x11
+	vpxor	ymm1,ymm1,ymm2
+
+	vbroadcasti128	ymm2,XMMWORD[((-80))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+	vpunpckhqdq	ymm2,ymm3,ymm3
+	vpxor	ymm2,ymm2,ymm3
+	vpclmulqdq	ymm2,ymm2,ymm8,0x00
+	vpxor	ymm6,ymm6,ymm2
+
+
+	vmovdqu	ymm3,YMMWORD[96+rcx]
+	vpshufb	ymm3,ymm3,ymm0
+
+	vbroadcasti128	ymm2,XMMWORD[((-64))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+	vmovdqu	ymm4,YMMWORD[96+rdi]
+	vpclmulqdq	ymm2,ymm3,ymm4,0x00
+	vpxor	ymm5,ymm5,ymm2
+	vpclmulqdq	ymm2,ymm3,ymm4,0x11
+	vpxor	ymm1,ymm1,ymm2
+	vpunpckhqdq	ymm2,ymm3,ymm3
+	vpxor	ymm2,ymm2,ymm3
+	vpclmulqdq	ymm2,ymm2,ymm8,0x10
+	vpxor	ymm6,ymm6,ymm2
+
+	vbroadcasti128	ymm2,XMMWORD[((-48))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+
+	vpxor	ymm6,ymm6,ymm5
+	vpxor	ymm6,ymm6,ymm1
+
+
+	vbroadcasti128	ymm4,XMMWORD[$L$gfpoly]
+	vpclmulqdq	ymm2,ymm4,ymm5,0x01
+	vpshufd	ymm5,ymm5,0x4e
+	vpxor	ymm6,ymm6,ymm5
+	vpxor	ymm6,ymm6,ymm2
+
+	vbroadcasti128	ymm2,XMMWORD[((-32))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+
+	vpclmulqdq	ymm2,ymm4,ymm6,0x01
+	vpshufd	ymm6,ymm6,0x4e
+	vpxor	ymm1,ymm1,ymm6
+	vpxor	ymm1,ymm1,ymm2
+
+	vbroadcasti128	ymm2,XMMWORD[((-16))+r11]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	vaesenc	ymm14,ymm14,ymm2
+	vaesenc	ymm15,ymm15,ymm2
+
+	vextracti128	xmm2,ymm1,1
+	vpxor	xmm1,xmm1,xmm2
+
+
+
+	vpxor	ymm2,ymm10,YMMWORD[rcx]
+	vpxor	ymm3,ymm10,YMMWORD[32+rcx]
+	vpxor	ymm5,ymm10,YMMWORD[64+rcx]
+	vpxor	ymm6,ymm10,YMMWORD[96+rcx]
+	vaesenclast	ymm12,ymm12,ymm2
+	vaesenclast	ymm13,ymm13,ymm3
+	vaesenclast	ymm14,ymm14,ymm5
+	vaesenclast	ymm15,ymm15,ymm6
+	vmovdqu	YMMWORD[rdx],ymm12
+	vmovdqu	YMMWORD[32+rdx],ymm13
+	vmovdqu	YMMWORD[64+rdx],ymm14
+	vmovdqu	YMMWORD[96+rdx],ymm15
+
+	sub	rcx,-128
+	sub	rdx,-128
+	add	r8,-128
+	cmp	r8,127
+	ja	NEAR $L$crypt_loop_4x__func2
+$L$crypt_loop_4x_done__func2:
+
+	test	r8,r8
+	jz	NEAR $L$done__func2
+
+
+
+
+
+	lea	rsi,[128+rdi]
+	sub	rsi,r8
+
+
+	vpxor	xmm5,xmm5,xmm5
+	vpxor	xmm6,xmm6,xmm6
+	vpxor	xmm7,xmm7,xmm7
+
+	cmp	r8,64
+	jb	NEAR $L$lessthan64bytes__func2
+
+
+	vpshufb	ymm12,ymm11,ymm0
+	vpaddd	ymm11,ymm11,YMMWORD[$L$inc_2blocks]
+	vpshufb	ymm13,ymm11,ymm0
+	vpaddd	ymm11,ymm11,YMMWORD[$L$inc_2blocks]
+	vpxor	ymm12,ymm12,ymm9
+	vpxor	ymm13,ymm13,ymm9
+	lea	rax,[16+r9]
+$L$vaesenc_loop_tail_1__func2:
+	vbroadcasti128	ymm2,XMMWORD[rax]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	add	rax,16
+	cmp	r11,rax
+	jne	NEAR $L$vaesenc_loop_tail_1__func2
+	vaesenclast	ymm12,ymm12,ymm10
+	vaesenclast	ymm13,ymm13,ymm10
+
+
+	vmovdqu	ymm2,YMMWORD[rcx]
+	vmovdqu	ymm3,YMMWORD[32+rcx]
+	vpxor	ymm12,ymm12,ymm2
+	vpxor	ymm13,ymm13,ymm3
+	vmovdqu	YMMWORD[rdx],ymm12
+	vmovdqu	YMMWORD[32+rdx],ymm13
+
+
+	vpshufb	ymm12,ymm2,ymm0
+	vpshufb	ymm13,ymm3,ymm0
+	vpxor	ymm12,ymm12,ymm1
+	vmovdqu	ymm2,YMMWORD[rsi]
+	vmovdqu	ymm3,YMMWORD[32+rsi]
+	vpclmulqdq	ymm5,ymm12,ymm2,0x00
+	vpclmulqdq	ymm6,ymm12,ymm2,0x01
+	vpclmulqdq	ymm4,ymm12,ymm2,0x10
+	vpxor	ymm6,ymm6,ymm4
+	vpclmulqdq	ymm7,ymm12,ymm2,0x11
+	vpclmulqdq	ymm4,ymm13,ymm3,0x00
+	vpxor	ymm5,ymm5,ymm4
+	vpclmulqdq	ymm4,ymm13,ymm3,0x01
+	vpxor	ymm6,ymm6,ymm4
+	vpclmulqdq	ymm4,ymm13,ymm3,0x10
+	vpxor	ymm6,ymm6,ymm4
+	vpclmulqdq	ymm4,ymm13,ymm3,0x11
+	vpxor	ymm7,ymm7,ymm4
+
+	add	rsi,64
+	add	rcx,64
+	add	rdx,64
+	sub	r8,64
+	jz	NEAR $L$reduce__func2
+
+	vpxor	xmm1,xmm1,xmm1
+
+
+$L$lessthan64bytes__func2:
+	vpshufb	ymm12,ymm11,ymm0
+	vpaddd	ymm11,ymm11,YMMWORD[$L$inc_2blocks]
+	vpshufb	ymm13,ymm11,ymm0
+	vpxor	ymm12,ymm12,ymm9
+	vpxor	ymm13,ymm13,ymm9
+	lea	rax,[16+r9]
+$L$vaesenc_loop_tail_2__func2:
+	vbroadcasti128	ymm2,XMMWORD[rax]
+	vaesenc	ymm12,ymm12,ymm2
+	vaesenc	ymm13,ymm13,ymm2
+	add	rax,16
+	cmp	r11,rax
+	jne	NEAR $L$vaesenc_loop_tail_2__func2
+	vaesenclast	ymm12,ymm12,ymm10
+	vaesenclast	ymm13,ymm13,ymm10
+
+
+
+
+	cmp	r8,32
+	jb	NEAR $L$xor_one_block__func2
+	je	NEAR $L$xor_two_blocks__func2
+
+$L$xor_three_blocks__func2:
+	vmovdqu	ymm2,YMMWORD[rcx]
+	vmovdqu	xmm3,XMMWORD[32+rcx]
+	vpxor	ymm12,ymm12,ymm2
+	vpxor	xmm13,xmm13,xmm3
+	vmovdqu	YMMWORD[rdx],ymm12
+	vmovdqu	XMMWORD[32+rdx],xmm13
+
+	vpshufb	ymm12,ymm2,ymm0
+	vpshufb	xmm13,xmm3,xmm0
+	vpxor	ymm12,ymm12,ymm1
+	vmovdqu	ymm2,YMMWORD[rsi]
+	vmovdqu	xmm3,XMMWORD[32+rsi]
+	vpclmulqdq	xmm4,xmm13,xmm3,0x00
+	vpxor	ymm5,ymm5,ymm4
+	vpclmulqdq	xmm4,xmm13,xmm3,0x01
+	vpxor	ymm6,ymm6,ymm4
+	vpclmulqdq	xmm4,xmm13,xmm3,0x10
+	vpxor	ymm6,ymm6,ymm4
+	vpclmulqdq	xmm4,xmm13,xmm3,0x11
+	vpxor	ymm7,ymm7,ymm4
+	jmp	NEAR $L$ghash_mul_one_vec_unreduced__func2
+
+$L$xor_two_blocks__func2:
+	vmovdqu	ymm2,YMMWORD[rcx]
+	vpxor	ymm12,ymm12,ymm2
+	vmovdqu	YMMWORD[rdx],ymm12
+	vpshufb	ymm12,ymm2,ymm0
+	vpxor	ymm12,ymm12,ymm1
+	vmovdqu	ymm2,YMMWORD[rsi]
+	jmp	NEAR $L$ghash_mul_one_vec_unreduced__func2
+
+$L$xor_one_block__func2:
+	vmovdqu	xmm2,XMMWORD[rcx]
+	vpxor	xmm12,xmm12,xmm2
+	vmovdqu	XMMWORD[rdx],xmm12
+	vpshufb	xmm12,xmm2,xmm0
+	vpxor	xmm12,xmm12,xmm1
+	vmovdqu	xmm2,XMMWORD[rsi]
+
+$L$ghash_mul_one_vec_unreduced__func2:
+	vpclmulqdq	ymm4,ymm12,ymm2,0x00
+	vpxor	ymm5,ymm5,ymm4
+	vpclmulqdq	ymm4,ymm12,ymm2,0x01
+	vpxor	ymm6,ymm6,ymm4
+	vpclmulqdq	ymm4,ymm12,ymm2,0x10
+	vpxor	ymm6,ymm6,ymm4
+	vpclmulqdq	ymm4,ymm12,ymm2,0x11
+	vpxor	ymm7,ymm7,ymm4
+
+$L$reduce__func2:
+
+	vbroadcasti128	ymm2,XMMWORD[$L$gfpoly]
+	vpclmulqdq	ymm3,ymm2,ymm5,0x01
+	vpshufd	ymm5,ymm5,0x4e
+	vpxor	ymm6,ymm6,ymm5
+	vpxor	ymm6,ymm6,ymm3
+	vpclmulqdq	ymm3,ymm2,ymm6,0x01
+	vpshufd	ymm6,ymm6,0x4e
+	vpxor	ymm7,ymm7,ymm6
+	vpxor	ymm7,ymm7,ymm3
+	vextracti128	xmm1,ymm7,1
+	vpxor	xmm1,xmm1,xmm7
+
+$L$done__func2:
+
+	vpshufb	xmm1,xmm1,xmm0
+	vmovdqu	XMMWORD[r12],xmm1
+
+	vzeroupper
+	movdqa	xmm6,XMMWORD[rsp]
+	movdqa	xmm7,XMMWORD[16+rsp]
+	movdqa	xmm8,XMMWORD[32+rsp]
+	movdqa	xmm9,XMMWORD[48+rsp]
+	movdqa	xmm10,XMMWORD[64+rsp]
+	movdqa	xmm11,XMMWORD[80+rsp]
+	movdqa	xmm12,XMMWORD[96+rsp]
+	movdqa	xmm13,XMMWORD[112+rsp]
+	movdqa	xmm14,XMMWORD[128+rsp]
+	movdqa	xmm15,XMMWORD[144+rsp]
+	add	rsp,160
+	pop	r12
+	pop	rdi
+	pop	rsi
+	ret
+$L$SEH_end_aes_gcm_dec_update_vaes_avx2_17:
+
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1 wrt ..imagebase
+	DD	$L$SEH_end_gcm_init_vpclmulqdq_avx2_5 wrt ..imagebase
+	DD	$L$SEH_info_gcm_init_vpclmulqdq_avx2_0 wrt ..imagebase
+
+	DD	$L$SEH_begin_gcm_gmult_vpclmulqdq_avx2_1 wrt ..imagebase
+	DD	$L$SEH_end_gcm_gmult_vpclmulqdq_avx2_5 wrt ..imagebase
+	DD	$L$SEH_info_gcm_gmult_vpclmulqdq_avx2_0 wrt ..imagebase
+
+	DD	$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1 wrt ..imagebase
+	DD	$L$SEH_end_gcm_ghash_vpclmulqdq_avx2_8 wrt ..imagebase
+	DD	$L$SEH_info_gcm_ghash_vpclmulqdq_avx2_0 wrt ..imagebase
+
+	DD	$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 wrt ..imagebase
+	DD	$L$SEH_end_aes_gcm_enc_update_vaes_avx2_17 wrt ..imagebase
+	DD	$L$SEH_info_aes_gcm_enc_update_vaes_avx2_0 wrt ..imagebase
+
+	DD	$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 wrt ..imagebase
+	DD	$L$SEH_end_aes_gcm_dec_update_vaes_avx2_17 wrt ..imagebase
+	DD	$L$SEH_info_aes_gcm_dec_update_vaes_avx2_0 wrt ..imagebase
+
+
+section	.xdata rdata align=8
+ALIGN	4
+$L$SEH_info_gcm_init_vpclmulqdq_avx2_0:
+	DB	1
+	DB	$L$SEH_endprologue_gcm_init_vpclmulqdq_avx2_4-$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1
+	DB	3
+	DB	0
+	DB	$L$SEH_prologue_gcm_init_vpclmulqdq_avx2_3-$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1
+	DB	104
+	DW	0
+	DB	$L$SEH_prologue_gcm_init_vpclmulqdq_avx2_2-$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1
+	DB	34
+
+	DW	0
+$L$SEH_info_gcm_gmult_vpclmulqdq_avx2_0:
+	DB	1
+	DB	$L$SEH_endprologue_gcm_gmult_vpclmulqdq_avx2_4-$L$SEH_begin_gcm_gmult_vpclmulqdq_avx2_1
+	DB	3
+	DB	0
+	DB	$L$SEH_prologue_gcm_gmult_vpclmulqdq_avx2_3-$L$SEH_begin_gcm_gmult_vpclmulqdq_avx2_1
+	DB	104
+	DW	0
+	DB	$L$SEH_prologue_gcm_gmult_vpclmulqdq_avx2_2-$L$SEH_begin_gcm_gmult_vpclmulqdq_avx2_1
+	DB	34
+
+	DW	0
+$L$SEH_info_gcm_ghash_vpclmulqdq_avx2_0:
+	DB	1
+	DB	$L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx2_7-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1
+	DB	9
+	DB	0
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_6-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1
+	DB	152
+	DW	3
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_5-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1
+	DB	136
+	DW	2
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_4-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1
+	DB	120
+	DW	1
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_3-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1
+	DB	104
+	DW	0
+	DB	$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_2-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1
+	DB	130
+
+	DW	0
+$L$SEH_info_aes_gcm_enc_update_vaes_avx2_0:
+	DB	1
+	DB	$L$SEH_endprologue_aes_gcm_enc_update_vaes_avx2_16-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+	DB	25
+	DB	0
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_15-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+	DB	248
+	DW	9
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_14-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+	DB	232
+	DW	8
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_13-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+	DB	216
+	DW	7
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_12-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+	DB	200
+	DW	6
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_11-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+	DB	184
+	DW	5
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_10-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+	DB	168
+	DW	4
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_9-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+	DB	152
+	DW	3
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_8-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+	DB	136
+	DW	2
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_7-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+	DB	120
+	DW	1
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_6-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+	DB	104
+	DW	0
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_5-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+	DB	1
+	DW	20
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_4-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+	DB	192
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_3-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+	DB	112
+	DB	$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_2-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+	DB	96
+
+	DW	0
+$L$SEH_info_aes_gcm_dec_update_vaes_avx2_0:
+	DB	1
+	DB	$L$SEH_endprologue_aes_gcm_dec_update_vaes_avx2_16-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+	DB	25
+	DB	0
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_15-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+	DB	248
+	DW	9
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_14-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+	DB	232
+	DW	8
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_13-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+	DB	216
+	DW	7
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_12-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+	DB	200
+	DW	6
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_11-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+	DB	184
+	DW	5
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_10-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+	DB	168
+	DW	4
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_9-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+	DB	152
+	DW	3
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_8-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+	DB	136
+	DW	2
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_7-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+	DB	120
+	DW	1
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_6-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+	DB	104
+	DW	0
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_5-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+	DB	1
+	DW	20
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_4-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+	DB	192
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_3-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+	DB	112
+	DB	$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_2-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+	DB	96
+
+	DW	0
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/sources.bzl b/gen/sources.bzl
index f91b49e..5af0dd2 100644
--- a/gen/sources.bzl
+++ b/gen/sources.bzl
@@ -104,6 +104,8 @@
 bcm_sources_asm = [
     "gen/bcm/aes-gcm-avx10-x86_64-apple.S",
     "gen/bcm/aes-gcm-avx10-x86_64-linux.S",
+    "gen/bcm/aes-gcm-avx2-x86_64-apple.S",
+    "gen/bcm/aes-gcm-avx2-x86_64-linux.S",
     "gen/bcm/aesni-gcm-x86_64-apple.S",
     "gen/bcm/aesni-gcm-x86_64-linux.S",
     "gen/bcm/aesni-x86-apple.S",
@@ -203,6 +205,7 @@
 
 bcm_sources_nasm = [
     "gen/bcm/aes-gcm-avx10-x86_64-win.asm",
+    "gen/bcm/aes-gcm-avx2-x86_64-win.asm",
     "gen/bcm/aesni-gcm-x86_64-win.asm",
     "gen/bcm/aesni-x86-win.asm",
     "gen/bcm/aesni-x86_64-win.asm",
diff --git a/gen/sources.cmake b/gen/sources.cmake
index 369a9e6..bbbb9c2 100644
--- a/gen/sources.cmake
+++ b/gen/sources.cmake
@@ -110,6 +110,8 @@
 
   gen/bcm/aes-gcm-avx10-x86_64-apple.S
   gen/bcm/aes-gcm-avx10-x86_64-linux.S
+  gen/bcm/aes-gcm-avx2-x86_64-apple.S
+  gen/bcm/aes-gcm-avx2-x86_64-linux.S
   gen/bcm/aesni-gcm-x86_64-apple.S
   gen/bcm/aesni-gcm-x86_64-linux.S
   gen/bcm/aesni-x86-apple.S
@@ -211,6 +213,7 @@
   BCM_SOURCES_NASM
 
   gen/bcm/aes-gcm-avx10-x86_64-win.asm
+  gen/bcm/aes-gcm-avx2-x86_64-win.asm
   gen/bcm/aesni-gcm-x86_64-win.asm
   gen/bcm/aesni-x86-win.asm
   gen/bcm/aesni-x86_64-win.asm
diff --git a/gen/sources.gni b/gen/sources.gni
index d9862d9..b5c3d54 100644
--- a/gen/sources.gni
+++ b/gen/sources.gni
@@ -104,6 +104,8 @@
 bcm_sources_asm = [
   "gen/bcm/aes-gcm-avx10-x86_64-apple.S",
   "gen/bcm/aes-gcm-avx10-x86_64-linux.S",
+  "gen/bcm/aes-gcm-avx2-x86_64-apple.S",
+  "gen/bcm/aes-gcm-avx2-x86_64-linux.S",
   "gen/bcm/aesni-gcm-x86_64-apple.S",
   "gen/bcm/aesni-gcm-x86_64-linux.S",
   "gen/bcm/aesni-x86-apple.S",
@@ -203,6 +205,7 @@
 
 bcm_sources_nasm = [
   "gen/bcm/aes-gcm-avx10-x86_64-win.asm",
+  "gen/bcm/aes-gcm-avx2-x86_64-win.asm",
   "gen/bcm/aesni-gcm-x86_64-win.asm",
   "gen/bcm/aesni-x86-win.asm",
   "gen/bcm/aesni-x86_64-win.asm",
diff --git a/gen/sources.json b/gen/sources.json
index 1b482e1..c4604c8 100644
--- a/gen/sources.json
+++ b/gen/sources.json
@@ -88,6 +88,8 @@
     "asm": [
       "gen/bcm/aes-gcm-avx10-x86_64-apple.S",
       "gen/bcm/aes-gcm-avx10-x86_64-linux.S",
+      "gen/bcm/aes-gcm-avx2-x86_64-apple.S",
+      "gen/bcm/aes-gcm-avx2-x86_64-linux.S",
       "gen/bcm/aesni-gcm-x86_64-apple.S",
       "gen/bcm/aesni-gcm-x86_64-linux.S",
       "gen/bcm/aesni-x86-apple.S",
@@ -186,6 +188,7 @@
     ],
     "nasm": [
       "gen/bcm/aes-gcm-avx10-x86_64-win.asm",
+      "gen/bcm/aes-gcm-avx2-x86_64-win.asm",
       "gen/bcm/aesni-gcm-x86_64-win.asm",
       "gen/bcm/aesni-x86-win.asm",
       "gen/bcm/aesni-x86_64-win.asm",