Add VAES + AVX2 optimized AES-GCM
Add a VAES-based AES-GCM implementation optimized for AMD Zen 3
processors, using AVX2 instead of AVX512 / AVX10. With AVX2, only 16
vector registers are available and some instructions are missing, which
is inconvenient and keeps the code from being easily shared with the
AVX512 / AVX10 version. However, using VAES still gives a significant
performance improvement: about 80-85% on long messages. The following
tables show the change in AES-256-GCM throughput in MB/s on a Zen 3
"Milan" processor for various message lengths in bytes.
Encryption:

Length  | 16384 |  4096 |  4095 |  1420 |   512 |   500 |
--------+-------+-------+-------+-------+-------+-------+
Before  |  3955 |  3749 |  3597 |  3054 |  2411 |  2038 |
After   |  7128 |  6631 |  5975 |  4788 |  3807 |  2676 |

Length  |   300 |   200 |    64 |    63 |    16 |
--------+-------+-------+-------+-------+-------+
Before  |  1757 |  1405 |   856 |   602 |   356 |
After   |  1885 |  1430 |   940 |   593 |   381 |

Decryption:

Length  | 16384 |  4096 |  4095 |  1420 |   512 |   500 |
--------+-------+-------+-------+-------+-------+-------+
Before  |  3962 |  3774 |  3593 |  2978 |  2510 |  1998 |
After   |  7378 |  6836 |  6282 |  4826 |  3868 |  2753 |

Length  |   300 |   200 |    64 |    63 |    16 |
--------+-------+-------+-------+-------+-------+
Before  |  1742 |  1428 |   856 |   535 |   383 |
After   |  1940 |  1534 |   940 |   573 |   383 |
Change-Id: I583dd6b48b81ab3c6df51bfe8729366cad500537
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/74368
Reviewed-by: David Benjamin <davidben@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/build.json b/build.json
index 4d3fb4e..71489c9 100644
--- a/build.json
+++ b/build.json
@@ -141,6 +141,7 @@
"perlasm_x86_64": [
{"src": "crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl"},
{"src": "crypto/fipsmodule/modes/asm/aes-gcm-avx10-x86_64.pl"},
+ {"src": "crypto/fipsmodule/modes/asm/aes-gcm-avx2-x86_64.pl"},
{"src": "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"},
{"src": "crypto/fipsmodule/modes/asm/ghash-ssse3-x86_64.pl"},
{"src": "crypto/fipsmodule/modes/asm/ghash-x86_64.pl"},
diff --git a/crypto/crypto.cc b/crypto/crypto.cc
index 912a993..ac0928f 100644
--- a/crypto/crypto.cc
+++ b/crypto/crypto.cc
@@ -54,7 +54,7 @@
// archive, linking on OS X will fail to resolve common symbols. By
// initialising it to zero, it becomes a "data symbol", which isn't so
// affected.
-HIDDEN uint8_t BORINGSSL_function_hit[8] = {0};
+HIDDEN uint8_t BORINGSSL_function_hit[9] = {0};
#endif
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
diff --git a/crypto/fipsmodule/modes/asm/aes-gcm-avx2-x86_64.pl b/crypto/fipsmodule/modes/asm/aes-gcm-avx2-x86_64.pl
new file mode 100644
index 0000000..6ea956b
--- /dev/null
+++ b/crypto/fipsmodule/modes/asm/aes-gcm-avx2-x86_64.pl
@@ -0,0 +1,1027 @@
+#!/usr/bin/env perl
+# Copyright 2024 The BoringSSL Authors
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+#
+#------------------------------------------------------------------------------
+#
+# VAES and VPCLMULQDQ optimized AES-GCM for x86_64 (AVX2 version)
+#
+# This is similar to aes-gcm-avx10-x86_64.pl, but it uses AVX2 instead of AVX512
+# / AVX10. This means it can only use 16 vector registers instead of 32, the
+# maximum vector length is 32 bytes, and some instructions such as vpternlogd
+# and masked loads/stores are unavailable. However, it is able to run on CPUs
+# that have VAES without AVX512 / AVX10, namely AMD Zen 3 (including "Milan"
+# server processors) and some Intel client CPUs such as Alder Lake.
+#
+# This implementation also uses Karatsuba multiplication instead of schoolbook
+# multiplication for GHASH in its main loop. This does not help much on Intel,
+# but it improves performance by ~5% on AMD Zen 3, which is the main target for
+# this implementation. Other factors weighing slightly in favor of Karatsuba
+# multiplication in this implementation are the lower maximum vector length
+# (which means there is space left in the Htable array to cache the halves of
+# the key powers XOR'd together) and the unavailability of the vpternlogd
+# instruction (which helped schoolbook a bit more than Karatsuba).
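+#
+# Concretely, Karatsuba multiplication computes the 256-bit carryless product
+# of 128-bit values a = a_H:a_L and b = b_H:b_L as
+#
+#       LO = a_L * b_L
+#       HI = a_H * b_H
+#       MI = ((a_L ^ a_H) * (b_L ^ b_H)) ^ LO ^ HI
+#
+# i.e. three carryless multiplications instead of the four needed by schoolbook
+# multiplication.  The (H_L ^ H_H) values for the key powers are precomputed
+# and cached in Htable, so only the data-block halves need to be XOR'd together
+# in the main loop.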
+
+use strict;
+
+my $flavour = shift;
+my $output = shift;
+if ( $flavour =~ /\./ ) { $output = $flavour; undef $flavour; }
+
+my $win64;
+my @argregs;
+if ( $flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/ ) {
+ $win64 = 1;
+ @argregs = ( "%rcx", "%rdx", "%r8", "%r9" );
+}
+else {
+ $win64 = 0;
+ @argregs = ( "%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9" );
+}
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/;
+my $dir = $1;
+my $xlate;
+( $xlate = "${dir}x86_64-xlate.pl" and -f $xlate )
+ or ( $xlate = "${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate )
+ or die "can't locate x86_64-xlate.pl";
+
+open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT = *OUT;
+
+my $g_cur_func_name;
+my $g_cur_func_uses_seh;
+my @g_cur_func_saved_gpregs;
+my @g_cur_func_saved_xmmregs;
+
+sub _begin_func {
+ my ( $funcname, $uses_seh ) = @_;
+ $g_cur_func_name = $funcname;
+ $g_cur_func_uses_seh = $uses_seh;
+ @g_cur_func_saved_gpregs = ();
+ @g_cur_func_saved_xmmregs = ();
+ return <<___;
+.globl $funcname
+.type $funcname,\@abi-omnipotent
+.align 32
+$funcname:
+ .cfi_startproc
+ @{[ $uses_seh ? ".seh_startproc" : "" ]}
+ _CET_ENDBR
+___
+}
+
+# Push a list of general purpose registers onto the stack.
+sub _save_gpregs {
+ my @gpregs = @_;
+ my $code = "";
+ die "_save_gpregs requires uses_seh" unless $g_cur_func_uses_seh;
+ die "_save_gpregs can only be called once per function"
+ if @g_cur_func_saved_gpregs;
+ die "Order must be _save_gpregs, then _save_xmmregs"
+ if @g_cur_func_saved_xmmregs;
+ @g_cur_func_saved_gpregs = @gpregs;
+ for my $reg (@gpregs) {
+ $code .= "push $reg\n";
+ if ($win64) {
+ $code .= ".seh_pushreg $reg\n";
+ }
+ else {
+ $code .= ".cfi_push $reg\n";
+ }
+ }
+ return $code;
+}
+
+# Push a list of xmm registers onto the stack if the target is Windows.
+sub _save_xmmregs {
+ my @xmmregs = @_;
+ my $num_xmmregs = scalar @xmmregs;
+ my $code = "";
+ die "_save_xmmregs requires uses_seh" unless $g_cur_func_uses_seh;
+ die "_save_xmmregs can only be called once per function"
+ if @g_cur_func_saved_xmmregs;
+ if ( $win64 and $num_xmmregs > 0 ) {
+ @g_cur_func_saved_xmmregs = @xmmregs;
+ my $is_misaligned = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0;
+ my $alloc_size = 16 * $num_xmmregs + ( $is_misaligned ? 8 : 0 );
+ $code .= "sub \$$alloc_size, %rsp\n";
+ $code .= ".seh_stackalloc $alloc_size\n";
+ for my $i ( 0 .. $num_xmmregs - 1 ) {
+ my $reg_num = $xmmregs[$i];
+ my $pos = 16 * $i;
+ $code .= "movdqa %xmm$reg_num, $pos(%rsp)\n";
+ $code .= ".seh_savexmm %xmm$reg_num, $pos\n";
+ }
+ }
+ return $code;
+}
+
+sub _end_func {
+ my $code = "";
+
+ # Restore any xmm registers that were saved earlier.
+ my $num_xmmregs = scalar @g_cur_func_saved_xmmregs;
+ if ( $win64 and $num_xmmregs > 0 ) {
+ my $need_alignment = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0;
+ my $alloc_size = 16 * $num_xmmregs + ( $need_alignment ? 8 : 0 );
+ for my $i ( 0 .. $num_xmmregs - 1 ) {
+ my $reg_num = $g_cur_func_saved_xmmregs[$i];
+ my $pos = 16 * $i;
+ $code .= "movdqa $pos(%rsp), %xmm$reg_num\n";
+ }
+ $code .= "add \$$alloc_size, %rsp\n";
+ }
+
+ # Restore any general purpose registers that were saved earlier.
+ for my $reg ( reverse @g_cur_func_saved_gpregs ) {
+ $code .= "pop $reg\n";
+ if ( !$win64 ) {
+ $code .= ".cfi_pop $reg\n";
+ }
+ }
+
+ $code .= <<___;
+ ret
+ @{[ $g_cur_func_uses_seh ? ".seh_endproc" : "" ]}
+ .cfi_endproc
+ .size $g_cur_func_name, . - $g_cur_func_name
+___
+ return $code;
+}
+
+my $code = <<___;
+.section .rodata
+.align 16
+
+ # A shuffle mask that reflects the bytes of 16-byte blocks
+.Lbswap_mask:
+ .quad 0x08090a0b0c0d0e0f, 0x0001020304050607
+
+ # This is the GHASH reducing polynomial without its constant term, i.e.
+ # x^128 + x^7 + x^2 + x, represented using the backwards mapping
+ # between bits and polynomial coefficients.
+ #
+ # Alternatively, it can be interpreted as the naturally-ordered
+ # representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the
+ # "reversed" GHASH reducing polynomial without its x^128 term.
+.Lgfpoly:
+ .quad 1, 0xc200000000000000
+
+ # Same as above, but with the (1 << 64) bit set.
+.Lgfpoly_and_internal_carrybit:
+ .quad 1, 0xc200000000000001
+
+.align 32
+ # The below constants are used for incrementing the counter blocks.
+.Lctr_pattern:
+ .quad 0, 0
+ .quad 1, 0
+.Linc_2blocks:
+ .quad 2, 0
+ .quad 2, 0
+
+.text
+___
+
+# We use Htable[0..7] to store H^8 through H^1, and Htable[8..11] to store the
+# 64-bit halves of the key powers XOR'd together (for Karatsuba multiplication)
+# in the order 8,6,7,5,4,2,3,1. We do not use Htable[12..15].
+my $NUM_H_POWERS = 8;
+my $OFFSETOFEND_H_POWERS = $NUM_H_POWERS * 16;
+my $OFFSETOF_H_POWERS_XORED = $OFFSETOFEND_H_POWERS;
+
+# Offset to 'rounds' in AES_KEY struct
+my $OFFSETOF_AES_ROUNDS = 240;
+
+# GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
+# the reduced products in \dst. Uses schoolbook multiplication.
+sub _ghash_mul {
+ my ( $a, $b, $dst, $gfpoly, $t0, $t1, $t2 ) = @_;
+ return <<___;
+ vpclmulqdq \$0x00, $a, $b, $t0 # LO = a_L * b_L
+ vpclmulqdq \$0x01, $a, $b, $t1 # MI_0 = a_L * b_H
+ vpclmulqdq \$0x10, $a, $b, $t2 # MI_1 = a_H * b_L
+ vpxor $t2, $t1, $t1 # MI = MI_0 + MI_1
+ vpclmulqdq \$0x01, $t0, $gfpoly, $t2 # LO_L*(x^63 + x^62 + x^57)
+ vpshufd \$0x4e, $t0, $t0 # Swap halves of LO
+ vpxor $t0, $t1, $t1 # Fold LO into MI (part 1)
+ vpxor $t2, $t1, $t1 # Fold LO into MI (part 2)
+ vpclmulqdq \$0x11, $a, $b, $dst # HI = a_H * b_H
+ vpclmulqdq \$0x01, $t1, $gfpoly, $t0 # MI_L*(x^63 + x^62 + x^57)
+ vpshufd \$0x4e, $t1, $t1 # Swap halves of MI
+ vpxor $t1, $dst, $dst # Fold MI into HI (part 1)
+ vpxor $t0, $dst, $dst # Fold MI into HI (part 2)
+___
+}
+
+# void gcm_init_vpclmulqdq_avx2(u128 Htable[16], const uint64_t H[2]);
+#
+# Initialize |Htable| with powers of the GHASH subkey |H|.
+#
+# We use Htable[0..7] to store H^8 through H^1, and Htable[8..11] to store the
+# 64-bit halves of the key powers XOR'd together (for Karatsuba multiplication)
+# in the order 8,6,7,5,4,2,3,1. We do not use Htable[12..15].
+$code .= _begin_func "gcm_init_vpclmulqdq_avx2", 1;
+{
+ my ( $HTABLE, $H_PTR ) = @argregs[ 0 .. 1 ];
+ my ( $TMP0, $TMP0_XMM ) = ( "%ymm0", "%xmm0" );
+ my ( $TMP1, $TMP1_XMM ) = ( "%ymm1", "%xmm1" );
+ my ( $TMP2, $TMP2_XMM ) = ( "%ymm2", "%xmm2" );
+ my ( $H_CUR, $H_CUR_XMM ) = ( "%ymm3", "%xmm3" );
+ my ( $H_CUR2, $H_CUR2_XMM ) = ( "%ymm4", "%xmm4" );
+ my ( $H_INC, $H_INC_XMM ) = ( "%ymm5", "%xmm5" );
+ my ( $GFPOLY, $GFPOLY_XMM ) = ( "%ymm6", "%xmm6" );
+
+ $code .= <<___;
+ @{[ _save_xmmregs (6) ]}
+ .seh_endprologue
+
+ # Load the byte-reflected hash subkey. BoringSSL provides it in
+ # byte-reflected form except the two halves are in the wrong order.
+ vpshufd \$0x4e, ($H_PTR), $H_CUR_XMM
+
+ # Finish preprocessing the byte-reflected hash subkey by multiplying it by
+ # x^-1 ("standard" interpretation of polynomial coefficients) or
+ # equivalently x^1 (natural interpretation). This gets the key into a
+ # format that avoids having to bit-reflect the data blocks later.
+ vpshufd \$0xd3, $H_CUR_XMM, $TMP0_XMM
+ vpsrad \$31, $TMP0_XMM, $TMP0_XMM
+ vpaddq $H_CUR_XMM, $H_CUR_XMM, $H_CUR_XMM
+ vpand .Lgfpoly_and_internal_carrybit(%rip), $TMP0_XMM, $TMP0_XMM
+ vpxor $TMP0_XMM, $H_CUR_XMM, $H_CUR_XMM
+
+ vbroadcasti128 .Lgfpoly(%rip), $GFPOLY
+
+ # Square H^1 to get H^2.
+ @{[ _ghash_mul $H_CUR_XMM, $H_CUR_XMM, $H_INC_XMM, $GFPOLY_XMM,
+ $TMP0_XMM, $TMP1_XMM, $TMP2_XMM ]}
+
+ # Create H_CUR = [H^2, H^1] and H_INC = [H^2, H^2].
+ vinserti128 \$1, $H_CUR_XMM, $H_INC, $H_CUR
+ vinserti128 \$1, $H_INC_XMM, $H_INC, $H_INC
+
+ # Compute H_CUR2 = [H^4, H^3].
+ @{[ _ghash_mul $H_INC, $H_CUR, $H_CUR2, $GFPOLY, $TMP0, $TMP1, $TMP2 ]}
+
+ # Store [H^2, H^1] and [H^4, H^3].
+ vmovdqu $H_CUR, 3*32($HTABLE)
+ vmovdqu $H_CUR2, 2*32($HTABLE)
+
+ # For Karatsuba multiplication: compute and store the two 64-bit halves of
+ # each key power XOR'd together. Order is 4,2,3,1.
+ vpunpcklqdq $H_CUR, $H_CUR2, $TMP0
+ vpunpckhqdq $H_CUR, $H_CUR2, $TMP1
+ vpxor $TMP1, $TMP0, $TMP0
+ vmovdqu $TMP0, $OFFSETOF_H_POWERS_XORED+32($HTABLE)
+
+ # Compute and store H_CUR = [H^6, H^5] and H_CUR2 = [H^8, H^7].
+ @{[ _ghash_mul $H_INC, $H_CUR2, $H_CUR, $GFPOLY, $TMP0, $TMP1, $TMP2 ]}
+ @{[ _ghash_mul $H_INC, $H_CUR, $H_CUR2, $GFPOLY, $TMP0, $TMP1, $TMP2 ]}
+ vmovdqu $H_CUR, 1*32($HTABLE)
+ vmovdqu $H_CUR2, 0*32($HTABLE)
+
+ # Again, compute and store the two 64-bit halves of each key power XOR'd
+ # together. Order is 8,6,7,5.
+ vpunpcklqdq $H_CUR, $H_CUR2, $TMP0
+ vpunpckhqdq $H_CUR, $H_CUR2, $TMP1
+ vpxor $TMP1, $TMP0, $TMP0
+ vmovdqu $TMP0, $OFFSETOF_H_POWERS_XORED($HTABLE)
+
+ vzeroupper
+___
+}
+$code .= _end_func;
+
+# Do one step of the GHASH update of four vectors of data blocks.
+# $i: the step to do, 0 through 9
+# $ghashdata_ptr: pointer to the data blocks (ciphertext or AAD)
+# $htable: pointer to the Htable for the key
+# $bswap_mask: mask for reflecting the bytes of blocks
+# $h_pow[2-1]_xored: XOR'd key powers cached from Htable
+# $tmp[0-2]: temporary registers. $tmp[1-2] must be preserved across steps.
+# $lo, $mi: working state for this macro that must be preserved across steps
+# $ghash_acc: the GHASH accumulator (input/output)
+sub _ghash_step_4x {
+ my (
+ $i, $ghashdata_ptr, $htable, $bswap_mask,
+ $h_pow2_xored, $h_pow1_xored, $tmp0, $tmp0_xmm,
+ $tmp1, $tmp2, $lo, $mi,
+ $ghash_acc, $ghash_acc_xmm
+ ) = @_;
+ my ( $hi, $hi_xmm ) = ( $ghash_acc, $ghash_acc_xmm ); # alias
+ if ( $i == 0 ) {
+ return <<___;
+ # First vector
+ vmovdqu 0*32($ghashdata_ptr), $tmp1
+ vpshufb $bswap_mask, $tmp1, $tmp1
+ vmovdqu 0*32($htable), $tmp2
+ vpxor $ghash_acc, $tmp1, $tmp1
+ vpclmulqdq \$0x00, $tmp2, $tmp1, $lo
+ vpclmulqdq \$0x11, $tmp2, $tmp1, $hi
+ vpunpckhqdq $tmp1, $tmp1, $tmp0
+ vpxor $tmp1, $tmp0, $tmp0
+ vpclmulqdq \$0x00, $h_pow2_xored, $tmp0, $mi
+___
+ }
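+    # Note: no GHASH work is assigned to step 1; the work is distributed
+    # unevenly across the 10 steps that the caller interleaves with AES rounds.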
+ elsif ( $i == 1 ) {
+ return <<___;
+___
+ }
+ elsif ( $i == 2 ) {
+ return <<___;
+ # Second vector
+ vmovdqu 1*32($ghashdata_ptr), $tmp1
+ vpshufb $bswap_mask, $tmp1, $tmp1
+ vmovdqu 1*32($htable), $tmp2
+ vpclmulqdq \$0x00, $tmp2, $tmp1, $tmp0
+ vpxor $tmp0, $lo, $lo
+ vpclmulqdq \$0x11, $tmp2, $tmp1, $tmp0
+ vpxor $tmp0, $hi, $hi
+ vpunpckhqdq $tmp1, $tmp1, $tmp0
+ vpxor $tmp1, $tmp0, $tmp0
+ vpclmulqdq \$0x10, $h_pow2_xored, $tmp0, $tmp0
+ vpxor $tmp0, $mi, $mi
+___
+ }
+ elsif ( $i == 3 ) {
+ return <<___;
+ # Third vector
+ vmovdqu 2*32($ghashdata_ptr), $tmp1
+ vpshufb $bswap_mask, $tmp1, $tmp1
+ vmovdqu 2*32($htable), $tmp2
+___
+ }
+ elsif ( $i == 4 ) {
+ return <<___;
+ vpclmulqdq \$0x00, $tmp2, $tmp1, $tmp0
+ vpxor $tmp0, $lo, $lo
+ vpclmulqdq \$0x11, $tmp2, $tmp1, $tmp0
+ vpxor $tmp0, $hi, $hi
+___
+ }
+ elsif ( $i == 5 ) {
+ return <<___;
+ vpunpckhqdq $tmp1, $tmp1, $tmp0
+ vpxor $tmp1, $tmp0, $tmp0
+ vpclmulqdq \$0x00, $h_pow1_xored, $tmp0, $tmp0
+ vpxor $tmp0, $mi, $mi
+
+ # Fourth vector
+ vmovdqu 3*32($ghashdata_ptr), $tmp1
+ vpshufb $bswap_mask, $tmp1, $tmp1
+___
+ }
+ elsif ( $i == 6 ) {
+ return <<___;
+ vmovdqu 3*32($htable), $tmp2
+ vpclmulqdq \$0x00, $tmp2, $tmp1, $tmp0
+ vpxor $tmp0, $lo, $lo
+ vpclmulqdq \$0x11, $tmp2, $tmp1, $tmp0
+ vpxor $tmp0, $hi, $hi
+ vpunpckhqdq $tmp1, $tmp1, $tmp0
+ vpxor $tmp1, $tmp0, $tmp0
+ vpclmulqdq \$0x10, $h_pow1_xored, $tmp0, $tmp0
+ vpxor $tmp0, $mi, $mi
+___
+ }
+ elsif ( $i == 7 ) {
+ return <<___;
+ # Finalize 'mi' following Karatsuba multiplication.
+ vpxor $lo, $mi, $mi
+ vpxor $hi, $mi, $mi
+
+ # Fold lo into mi.
+ vbroadcasti128 .Lgfpoly(%rip), $tmp2
+ vpclmulqdq \$0x01, $lo, $tmp2, $tmp0
+ vpshufd \$0x4e, $lo, $lo
+ vpxor $lo, $mi, $mi
+ vpxor $tmp0, $mi, $mi
+___
+ }
+ elsif ( $i == 8 ) {
+ return <<___;
+ # Fold mi into hi.
+ vpclmulqdq \$0x01, $mi, $tmp2, $tmp0
+ vpshufd \$0x4e, $mi, $mi
+ vpxor $mi, $hi, $hi
+ vpxor $tmp0, $hi, $hi
+___
+ }
+ elsif ( $i == 9 ) {
+ return <<___;
+ vextracti128 \$1, $hi, $tmp0_xmm
+ vpxor $tmp0_xmm, $hi_xmm, $ghash_acc_xmm
+___
+ }
+}
+
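+# Do the full GHASH update of four vectors of data blocks, i.e. all 10 steps
+# of _ghash_step_4x in sequence.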
+sub _ghash_4x {
+ my $code = "";
+ for my $i ( 0 .. 9 ) {
+ $code .= _ghash_step_4x $i, @_;
+ }
+ return $code;
+}
+
+# void gcm_gmult_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16]);
+$code .= _begin_func "gcm_gmult_vpclmulqdq_avx2", 1;
+{
+ my ( $GHASH_ACC_PTR, $HTABLE ) = @argregs[ 0 .. 1 ];
+ my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
+ map( "%xmm$_", ( 0 .. 6 ) );
+
+ $code .= <<___;
+ @{[ _save_xmmregs (6) ]}
+ .seh_endprologue
+
+ vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC
+ vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK
+ vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1
+ vmovdqu .Lgfpoly(%rip), $GFPOLY
+ vpshufb $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
+
+ @{[ _ghash_mul $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $T0, $T1, $T2 ]}
+
+ vpshufb $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
+ vmovdqu $GHASH_ACC, ($GHASH_ACC_PTR)
+___
+}
+$code .= _end_func;
+
+# void gcm_ghash_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16],
+# const uint8_t *in, size_t len);
+#
+# Using the key |Htable|, update the GHASH accumulator |Xi| with the data given
+# by |in| and |len|. |len| must be a multiple of 16.
+#
+# This function handles large amounts of AAD efficiently, while also keeping the
+# overhead low for small amounts of AAD, which is the common case. TLS uses less
+# than one block of AAD, but (uncommonly) other use cases may use much more.
+$code .= _begin_func "gcm_ghash_vpclmulqdq_avx2", 1;
+{
+ # Function arguments
+ my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ];
+
+ # Additional local variables
+ my ( $TMP0, $TMP0_XMM ) = ( "%ymm0", "%xmm0" );
+ my ( $TMP1, $TMP1_XMM ) = ( "%ymm1", "%xmm1" );
+ my ( $TMP2, $TMP2_XMM ) = ( "%ymm2", "%xmm2" );
+ my ( $LO, $LO_XMM ) = ( "%ymm3", "%xmm3" );
+ my ( $MI, $MI_XMM ) = ( "%ymm4", "%xmm4" );
+ my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%ymm5", "%xmm5" );
+ my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%ymm6", "%xmm6" );
+ my ( $GFPOLY, $GFPOLY_XMM ) = ( "%ymm7", "%xmm7" );
+ my $H_POW2_XORED = "%ymm8";
+ my $H_POW1_XORED = "%ymm9";
+
+ $code .= <<___;
+ @{[ _save_xmmregs (6 .. 9) ]}
+ .seh_endprologue
+
+ vbroadcasti128 .Lbswap_mask(%rip), $BSWAP_MASK
+ vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM
+ vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+ vbroadcasti128 .Lgfpoly(%rip), $GFPOLY
+
+ # Optimize for AADLEN < 32 by checking for AADLEN < 32 before AADLEN < 128.
+ cmp \$32, $AADLEN
+ jb .Lghash_lastblock
+
+ cmp \$127, $AADLEN
+ jbe .Lghash_loop_1x
+
+ # Update GHASH with 128 bytes of AAD at a time.
+ vmovdqu $OFFSETOF_H_POWERS_XORED($HTABLE), $H_POW2_XORED
+ vmovdqu $OFFSETOF_H_POWERS_XORED+32($HTABLE), $H_POW1_XORED
+.Lghash_loop_4x:
+ @{[ _ghash_4x $AAD, $HTABLE, $BSWAP_MASK, $H_POW2_XORED, $H_POW1_XORED,
+ $TMP0, $TMP0_XMM, $TMP1, $TMP2, $LO, $MI, $GHASH_ACC,
+ $GHASH_ACC_XMM ]}
+ sub \$-128, $AAD # 128 is 4 bytes, -128 is 1 byte
+ add \$-128, $AADLEN
+ cmp \$127, $AADLEN
+ ja .Lghash_loop_4x
+
+ # Update GHASH with 32 bytes of AAD at a time.
+ cmp \$32, $AADLEN
+ jb .Lghash_loop_1x_done
+.Lghash_loop_1x:
+ vmovdqu ($AAD), $TMP0
+ vpshufb $BSWAP_MASK, $TMP0, $TMP0
+ vpxor $TMP0, $GHASH_ACC, $GHASH_ACC
+ vmovdqu $OFFSETOFEND_H_POWERS-32($HTABLE), $TMP0
+ @{[ _ghash_mul $TMP0, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $TMP1, $TMP2, $LO ]}
+ vextracti128 \$1, $GHASH_ACC, $TMP0_XMM
+ vpxor $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+ add \$32, $AAD
+ sub \$32, $AADLEN
+ cmp \$32, $AADLEN
+ jae .Lghash_loop_1x
+.Lghash_loop_1x_done:
+ # Issue the vzeroupper that is needed after using ymm registers. Do it here
+ # instead of at the end, to minimize overhead for small AADLEN.
+ vzeroupper
+
+ # Update GHASH with the remaining 16-byte block if any.
+.Lghash_lastblock:
+ test $AADLEN, $AADLEN
+ jz .Lghash_done
+ vmovdqu ($AAD), $TMP0_XMM
+ vpshufb $BSWAP_MASK_XMM, $TMP0_XMM, $TMP0_XMM
+ vpxor $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+ vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $TMP0_XMM
+ @{[ _ghash_mul $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM,
+ $TMP1_XMM, $TMP2_XMM, $LO_XMM ]}
+
+.Lghash_done:
+ # Store the updated GHASH accumulator back to memory.
+ vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+ vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
+___
+}
+$code .= _end_func;
+
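+# Do one non-final AES encryption round on each of the four vectors of counter
+# blocks, using the round key that has been broadcast to $round_key.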
+sub _vaesenc_4x {
+ my ( $round_key, $aesdata0, $aesdata1, $aesdata2, $aesdata3 ) = @_;
+ return <<___;
+ vaesenc $round_key, $aesdata0, $aesdata0
+ vaesenc $round_key, $aesdata1, $aesdata1
+ vaesenc $round_key, $aesdata2, $aesdata2
+ vaesenc $round_key, $aesdata3, $aesdata3
+___
+}
+
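+# Prepare the AES inputs for the next four vectors (128 bytes): generate eight
+# little-endian counter blocks from $le_ctr (advancing $le_ctr by 8), swap them
+# to big-endian, and XOR in the zero-th round key, leaving the results in
+# $aesdata[0-3].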
+sub _ctr_begin_4x {
+ my (
+ $le_ctr, $bswap_mask, $rndkey0, $aesdata0,
+ $aesdata1, $aesdata2, $aesdata3, $tmp
+ ) = @_;
+ return <<___;
+ # Increment le_ctr four times to generate four vectors of little-endian
+ # counter blocks, swap each to big-endian, and store them in aesdata[0-3].
+ vmovdqu .Linc_2blocks(%rip), $tmp
+ vpshufb $bswap_mask, $le_ctr, $aesdata0
+ vpaddd $tmp, $le_ctr, $le_ctr
+ vpshufb $bswap_mask, $le_ctr, $aesdata1
+ vpaddd $tmp, $le_ctr, $le_ctr
+ vpshufb $bswap_mask, $le_ctr, $aesdata2
+ vpaddd $tmp, $le_ctr, $le_ctr
+ vpshufb $bswap_mask, $le_ctr, $aesdata3
+ vpaddd $tmp, $le_ctr, $le_ctr
+
+ # AES "round zero": XOR in the zero-th round key.
+ vpxor $rndkey0, $aesdata0, $aesdata0
+ vpxor $rndkey0, $aesdata1, $aesdata1
+ vpxor $rndkey0, $aesdata2, $aesdata2
+ vpxor $rndkey0, $aesdata3, $aesdata3
+___
+}
+
+# Do the last AES round for four vectors of counter blocks, XOR four vectors of
+# source data with the resulting keystream blocks, and write the result to the
+# destination buffer.  Rather than XOR'ing the source data in afterwards, this
+# XOR's it into the last round key and relies on the property
+# vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), which reduces latency but
+# has the same effect.
+sub _aesenclast_and_xor_4x {
+ my (
+ $src, $dst, $rndkeylast, $aesdata0,
+ $aesdata1, $aesdata2, $aesdata3, $t0,
+ $t1, $t2, $t3
+ ) = @_;
+ return <<___;
+ vpxor 0*32($src), $rndkeylast, $t0
+ vpxor 1*32($src), $rndkeylast, $t1
+ vpxor 2*32($src), $rndkeylast, $t2
+ vpxor 3*32($src), $rndkeylast, $t3
+ vaesenclast $t0, $aesdata0, $aesdata0
+ vaesenclast $t1, $aesdata1, $aesdata1
+ vaesenclast $t2, $aesdata2, $aesdata2
+ vaesenclast $t3, $aesdata3, $aesdata3
+ vmovdqu $aesdata0, 0*32($dst)
+ vmovdqu $aesdata1, 1*32($dst)
+ vmovdqu $aesdata2, 2*32($dst)
+ vmovdqu $aesdata3, 3*32($dst)
+___
+}
+
+my $g_update_macro_expansion_count = 0;
+
+# void aes_gcm_{enc,dec}_update_vaes_avx2(const uint8_t *in, uint8_t *out,
+# size_t len, const AES_KEY *key,
+# const uint8_t ivec[16],
+# const u128 Htable[16],
+# uint8_t Xi[16]);
+#
+# This macro generates a GCM encryption or decryption update function with the
+# above prototype (with \enc selecting which one). The function computes the
+# next portion of the CTR keystream, XOR's it with |len| bytes from |in|, and
+# writes the resulting encrypted or decrypted data to |out|. It also updates
+# the GHASH accumulator |Xi| using the next |len| ciphertext bytes.
+#
+# |len| must be a multiple of 16. The caller must do any buffering needed to
+# ensure this. Both in-place and out-of-place en/decryption are supported.
+#
+# |ivec| must give the current counter in big-endian format. This function
+# loads the counter from |ivec| and increments the loaded counter as needed, but
+# it does *not* store the updated counter back to |ivec|. The caller must
+# update |ivec| if any more data segments follow. Internally, only the low
+# 32-bit word of the counter is incremented, following the GCM standard.
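+#
+# The generated function en/decrypts and hashes 128 bytes (four vectors) per
+# iteration of its main loop, then handles any remaining 16 to 112 bytes with
+# tail code that processes the data two vectors (up to 64 bytes) at a time.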
+sub _aes_gcm_update {
+ my $local_label_suffix = "__func" . ++$g_update_macro_expansion_count;
+ my ($enc) = @_;
+ my $code = "";
+
+ # Function arguments
+ my ( $SRC, $DST, $DATALEN, $AESKEY, $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR )
+ = $win64
+ ? ( @argregs[ 0 .. 3 ], "%rsi", "%rdi", "%r12" )
+ : ( @argregs[ 0 .. 5 ], "%r12" );
+
+ # Additional local variables.
+ # %rax is used as a temporary register. BE_CTR_PTR is also available as a
+ # temporary register after the counter is loaded.
+
+ # AES key length in bytes
+ my ( $AESKEYLEN, $AESKEYLEN64 ) = ( "%r10d", "%r10" );
+
+ # Pointer to the last AES round key for the chosen AES variant
+ my $RNDKEYLAST_PTR = "%r11";
+
+ # BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
+ # using vpshufb, copied to all 128-bit lanes.
+ my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%ymm0", "%xmm0" );
+
+ # GHASH_ACC is the accumulator variable for GHASH. When fully reduced,
+ # only the lowest 128-bit lane can be nonzero. When not fully reduced,
+ # more than one lane may be used, and they need to be XOR'd together.
+ my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%ymm1", "%xmm1" );
+
+ # TMP[0-2] are temporary registers.
+ my ( $TMP0, $TMP0_XMM ) = ( "%ymm2", "%xmm2" );
+ my ( $TMP1, $TMP1_XMM ) = ( "%ymm3", "%xmm3" );
+ my ( $TMP2, $TMP2_XMM ) = ( "%ymm4", "%xmm4" );
+
+ # LO and MI are used to accumulate unreduced GHASH products.
+ my ( $LO, $LO_XMM ) = ( "%ymm5", "%xmm5" );
+ my ( $MI, $MI_XMM ) = ( "%ymm6", "%xmm6" );
+
+ # Cached key powers from Htable
+ my ( $H_POW2_XORED, $H_POW2_XORED_XMM ) = ( "%ymm7", "%xmm7" );
+ my ( $H_POW1_XORED, $H_POW1_XORED_XMM ) = ( "%ymm8", "%xmm8" );
+
+ # RNDKEY0 caches the zero-th round key, and RNDKEYLAST the last one.
+ my $RNDKEY0 = "%ymm9";
+ my $RNDKEYLAST = "%ymm10";
+
+ # LE_CTR contains the next set of little-endian counter blocks.
+ my $LE_CTR = "%ymm11";
+
+ # AESDATA[0-3] hold the counter blocks that are being encrypted by AES.
+ my ( $AESDATA0, $AESDATA0_XMM ) = ( "%ymm12", "%xmm12" );
+ my ( $AESDATA1, $AESDATA1_XMM ) = ( "%ymm13", "%xmm13" );
+ my ( $AESDATA2, $AESDATA2_XMM ) = ( "%ymm14", "%xmm14" );
+ my ( $AESDATA3, $AESDATA3_XMM ) = ( "%ymm15", "%xmm15" );
+ my @AESDATA = ( $AESDATA0, $AESDATA1, $AESDATA2, $AESDATA3 );
+
+ my @ghash_4x_args = (
+ $enc ? $DST : $SRC, $HTABLE, $BSWAP_MASK, $H_POW2_XORED,
+ $H_POW1_XORED, $TMP0, $TMP0_XMM, $TMP1,
+ $TMP2, $LO, $MI, $GHASH_ACC,
+ $GHASH_ACC_XMM
+ );
+
+ if ($win64) {
+ $code .= <<___;
+ @{[ _save_gpregs $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR ]}
+ mov 64(%rsp), $BE_CTR_PTR # arg5
+ mov 72(%rsp), $HTABLE # arg6
+ mov 80(%rsp), $GHASH_ACC_PTR # arg7
+ @{[ _save_xmmregs (6 .. 15) ]}
+ .seh_endprologue
+___
+ }
+ else {
+ $code .= <<___;
+ @{[ _save_gpregs $GHASH_ACC_PTR ]}
+ mov 16(%rsp), $GHASH_ACC_PTR # arg7
+___
+ }
+
+ if ($enc) {
+ $code .= <<___;
+#ifdef BORINGSSL_DISPATCH_TEST
+ .extern BORINGSSL_function_hit
+ movb \$1,BORINGSSL_function_hit+8(%rip)
+#endif
+___
+ }
+ $code .= <<___;
+ vbroadcasti128 .Lbswap_mask(%rip), $BSWAP_MASK
+
+ # Load the GHASH accumulator and the starting counter.
+ # BoringSSL passes these values in big endian format.
+ vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM
+ vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+ vbroadcasti128 ($BE_CTR_PTR), $LE_CTR
+ vpshufb $BSWAP_MASK, $LE_CTR, $LE_CTR
+
+ # Load the AES key length in bytes. BoringSSL stores number of rounds
+ # minus 1, so convert using: AESKEYLEN = 4 * aeskey->rounds - 20.
+ movl $OFFSETOF_AES_ROUNDS($AESKEY), $AESKEYLEN
+ lea -20(,$AESKEYLEN,4), $AESKEYLEN
+
+ # Make RNDKEYLAST_PTR point to the last AES round key. This is the
+ # round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256
+ # respectively. Then load the zero-th and last round keys.
+ lea 6*16($AESKEY,$AESKEYLEN64,4), $RNDKEYLAST_PTR
+ vbroadcasti128 ($AESKEY), $RNDKEY0
+ vbroadcasti128 ($RNDKEYLAST_PTR), $RNDKEYLAST
+
+ # Finish initializing LE_CTR by adding 1 to the second block.
+ vpaddd .Lctr_pattern(%rip), $LE_CTR, $LE_CTR
+
+ # If there are at least 128 bytes of data, then continue into the loop that
+ # processes 128 bytes of data at a time. Otherwise skip it.
+ cmp \$127, $DATALEN
+ jbe .Lcrypt_loop_4x_done$local_label_suffix
+
+ vmovdqu $OFFSETOF_H_POWERS_XORED($HTABLE), $H_POW2_XORED
+ vmovdqu $OFFSETOF_H_POWERS_XORED+32($HTABLE), $H_POW1_XORED
+___
+
+ # Main loop: en/decrypt and hash 4 vectors (128 bytes) at a time.
+
+ if ($enc) {
+ $code .= <<___;
+ # Encrypt the first 4 vectors of plaintext blocks.
+ @{[ _ctr_begin_4x $LE_CTR, $BSWAP_MASK, $RNDKEY0, @AESDATA, $TMP0 ]}
+ lea 16($AESKEY), %rax
+.Lvaesenc_loop_first_4_vecs$local_label_suffix:
+ vbroadcasti128 (%rax), $TMP0
+ @{[ _vaesenc_4x $TMP0, @AESDATA ]}
+ add \$16, %rax
+ cmp %rax, $RNDKEYLAST_PTR
+ jne .Lvaesenc_loop_first_4_vecs$local_label_suffix
+ @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA,
+ $TMP0, $TMP1, $LO, $MI ]}
+ sub \$-128, $SRC # 128 is 4 bytes, -128 is 1 byte
+ add \$-128, $DATALEN
+ cmp \$127, $DATALEN
+ jbe .Lghash_last_ciphertext_4x$local_label_suffix
+___
+ }
+
+ $code .= <<___;
+.align 16
+.Lcrypt_loop_4x$local_label_suffix:
+
+ # Start the AES encryption of the counter blocks.
+ @{[ _ctr_begin_4x $LE_CTR, $BSWAP_MASK, $RNDKEY0, @AESDATA, $TMP0 ]}
+ cmp \$24, $AESKEYLEN
+ jl .Laes128$local_label_suffix
+ je .Laes192$local_label_suffix
+ # AES-256
+ vbroadcasti128 -13*16($RNDKEYLAST_PTR), $TMP0
+ @{[ _vaesenc_4x $TMP0, @AESDATA ]}
+ vbroadcasti128 -12*16($RNDKEYLAST_PTR), $TMP0
+ @{[ _vaesenc_4x $TMP0, @AESDATA ]}
+.Laes192$local_label_suffix:
+ vbroadcasti128 -11*16($RNDKEYLAST_PTR), $TMP0
+ @{[ _vaesenc_4x $TMP0, @AESDATA ]}
+ vbroadcasti128 -10*16($RNDKEYLAST_PTR), $TMP0
+ @{[ _vaesenc_4x $TMP0, @AESDATA ]}
+.Laes128$local_label_suffix:
+___
+
+ # Finish the AES encryption of the counter blocks in AESDATA[0-3],
+ # interleaved with the GHASH update of the ciphertext blocks.
+ for my $i ( reverse 1 .. 9 ) {
+ $code .= <<___;
+ @{[ _ghash_step_4x 9-$i, @ghash_4x_args ]}
+ vbroadcasti128 -$i*16($RNDKEYLAST_PTR), $TMP0
+ @{[ _vaesenc_4x $TMP0, @AESDATA ]}
+___
+ }
+ $code .= <<___;
+ @{[ _ghash_step_4x 9, @ghash_4x_args ]}
+
+ @{[ $enc ? "sub \$-128, $DST" : "" ]} # 128 is 4 bytes, -128 is 1 byte
+ @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA,
+ $TMP0, $TMP1, $LO, $MI ]}
+ sub \$-128, $SRC
+ @{[ !$enc ? "sub \$-128, $DST" : "" ]}
+ add \$-128, $DATALEN
+ cmp \$127, $DATALEN
+ ja .Lcrypt_loop_4x$local_label_suffix
+___
+
+ if ($enc) {
+
+ # Update GHASH with the last set of ciphertext blocks.
+ $code .= <<___;
+.Lghash_last_ciphertext_4x$local_label_suffix:
+ @{[ _ghash_4x @ghash_4x_args ]}
+ sub \$-128, $DST
+___
+ }
+
+ my $POWERS_PTR = $BE_CTR_PTR; # BE_CTR_PTR is free to be reused.
+ my ( $HI, $HI_XMM ) = ( $H_POW2_XORED, $H_POW2_XORED_XMM ); # reuse
+
+ $code .= <<___;
+.Lcrypt_loop_4x_done$local_label_suffix:
+ # Check whether any data remains.
+ test $DATALEN, $DATALEN
+ jz .Ldone$local_label_suffix
+
+ # DATALEN is in [16, 32, 48, 64, 80, 96, 112].
+
+ # Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N
+ # is the number of blocks that remain.
+ lea $OFFSETOFEND_H_POWERS($HTABLE), $POWERS_PTR
+ sub $DATALEN, $POWERS_PTR
+
+ # Start collecting the unreduced GHASH intermediate value LO, MI, HI.
+ vpxor $LO_XMM, $LO_XMM, $LO_XMM
+ vpxor $MI_XMM, $MI_XMM, $MI_XMM
+ vpxor $HI_XMM, $HI_XMM, $HI_XMM
+
+ cmp \$64, $DATALEN
+ jb .Llessthan64bytes$local_label_suffix
+
+ # DATALEN is in [64, 80, 96, 112]. Encrypt two vectors of counter blocks.
+ vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA0
+ vpaddd .Linc_2blocks(%rip), $LE_CTR, $LE_CTR
+ vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA1
+ vpaddd .Linc_2blocks(%rip), $LE_CTR, $LE_CTR
+ vpxor $RNDKEY0, $AESDATA0, $AESDATA0
+ vpxor $RNDKEY0, $AESDATA1, $AESDATA1
+ lea 16($AESKEY), %rax
+.Lvaesenc_loop_tail_1$local_label_suffix:
+ vbroadcasti128 (%rax), $TMP0
+ vaesenc $TMP0, $AESDATA0, $AESDATA0
+ vaesenc $TMP0, $AESDATA1, $AESDATA1
+ add \$16, %rax
+ cmp %rax, $RNDKEYLAST_PTR
+ jne .Lvaesenc_loop_tail_1$local_label_suffix
+ vaesenclast $RNDKEYLAST, $AESDATA0, $AESDATA0
+ vaesenclast $RNDKEYLAST, $AESDATA1, $AESDATA1
+
+ # XOR the data with the two vectors of keystream blocks.
+ vmovdqu 0($SRC), $TMP0
+ vmovdqu 32($SRC), $TMP1
+ vpxor $TMP0, $AESDATA0, $AESDATA0
+ vpxor $TMP1, $AESDATA1, $AESDATA1
+ vmovdqu $AESDATA0, 0($DST)
+ vmovdqu $AESDATA1, 32($DST)
+
+ # Update GHASH with two vectors of ciphertext blocks, without reducing.
+ vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0
+ vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA1 : $TMP1 ]}, $AESDATA1
+ vpxor $GHASH_ACC, $AESDATA0, $AESDATA0
+ vmovdqu ($POWERS_PTR), $TMP0
+ vmovdqu 32($POWERS_PTR), $TMP1
+ vpclmulqdq \$0x00, $TMP0, $AESDATA0, $LO
+ vpclmulqdq \$0x01, $TMP0, $AESDATA0, $MI
+ vpclmulqdq \$0x10, $TMP0, $AESDATA0, $TMP2
+ vpxor $TMP2, $MI, $MI
+ vpclmulqdq \$0x11, $TMP0, $AESDATA0, $HI
+ vpclmulqdq \$0x00, $TMP1, $AESDATA1, $TMP2
+ vpxor $TMP2, $LO, $LO
+ vpclmulqdq \$0x01, $TMP1, $AESDATA1, $TMP2
+ vpxor $TMP2, $MI, $MI
+ vpclmulqdq \$0x10, $TMP1, $AESDATA1, $TMP2
+ vpxor $TMP2, $MI, $MI
+ vpclmulqdq \$0x11, $TMP1, $AESDATA1, $TMP2
+ vpxor $TMP2, $HI, $HI
+
+ add \$64, $POWERS_PTR
+ add \$64, $SRC
+ add \$64, $DST
+ sub \$64, $DATALEN
+ jz .Lreduce$local_label_suffix
+
+ vpxor $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+
+    # DATALEN is in [16, 32, 48]. Encrypt the last two vectors of counter blocks.
+.Llessthan64bytes$local_label_suffix:
+ vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA0
+ vpaddd .Linc_2blocks(%rip), $LE_CTR, $LE_CTR
+ vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA1
+ vpxor $RNDKEY0, $AESDATA0, $AESDATA0
+ vpxor $RNDKEY0, $AESDATA1, $AESDATA1
+ lea 16($AESKEY), %rax
+.Lvaesenc_loop_tail_2$local_label_suffix:
+ vbroadcasti128 (%rax), $TMP0
+ vaesenc $TMP0, $AESDATA0, $AESDATA0
+ vaesenc $TMP0, $AESDATA1, $AESDATA1
+ add \$16, %rax
+ cmp %rax, $RNDKEYLAST_PTR
+ jne .Lvaesenc_loop_tail_2$local_label_suffix
+ vaesenclast $RNDKEYLAST, $AESDATA0, $AESDATA0
+ vaesenclast $RNDKEYLAST, $AESDATA1, $AESDATA1
+
+ # XOR the remaining data with the keystream blocks, and update GHASH with
+ # the remaining ciphertext blocks without reducing.
+
+ cmp \$32, $DATALEN
+ jb .Lxor_one_block$local_label_suffix
+ je .Lxor_two_blocks$local_label_suffix
+
+.Lxor_three_blocks$local_label_suffix:
+ vmovdqu 0($SRC), $TMP0
+ vmovdqu 32($SRC), $TMP1_XMM
+ vpxor $TMP0, $AESDATA0, $AESDATA0
+ vpxor $TMP1_XMM, $AESDATA1_XMM, $AESDATA1_XMM
+ vmovdqu $AESDATA0, 0($DST)
+ vmovdqu $AESDATA1_XMM, 32($DST)
+
+ vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0
+ vpshufb $BSWAP_MASK_XMM, @{[ $enc ? $AESDATA1_XMM : $TMP1_XMM ]}, $AESDATA1_XMM
+ vpxor $GHASH_ACC, $AESDATA0, $AESDATA0
+ vmovdqu ($POWERS_PTR), $TMP0
+ vmovdqu 32($POWERS_PTR), $TMP1_XMM
+ vpclmulqdq \$0x00, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM
+ vpxor $TMP2, $LO, $LO
+ vpclmulqdq \$0x01, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM
+ vpxor $TMP2, $MI, $MI
+ vpclmulqdq \$0x10, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM
+ vpxor $TMP2, $MI, $MI
+ vpclmulqdq \$0x11, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM
+ vpxor $TMP2, $HI, $HI
+ jmp .Lghash_mul_one_vec_unreduced$local_label_suffix
+
+.Lxor_two_blocks$local_label_suffix:
+ vmovdqu ($SRC), $TMP0
+ vpxor $TMP0, $AESDATA0, $AESDATA0
+ vmovdqu $AESDATA0, ($DST)
+ vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0
+ vpxor $GHASH_ACC, $AESDATA0, $AESDATA0
+ vmovdqu ($POWERS_PTR), $TMP0
+ jmp .Lghash_mul_one_vec_unreduced$local_label_suffix
+
+.Lxor_one_block$local_label_suffix:
+ vmovdqu ($SRC), $TMP0_XMM
+ vpxor $TMP0_XMM, $AESDATA0_XMM, $AESDATA0_XMM
+ vmovdqu $AESDATA0_XMM, ($DST)
+ vpshufb $BSWAP_MASK_XMM, @{[ $enc ? $AESDATA0_XMM : $TMP0_XMM ]}, $AESDATA0_XMM
+ vpxor $GHASH_ACC_XMM, $AESDATA0_XMM, $AESDATA0_XMM
+ vmovdqu ($POWERS_PTR), $TMP0_XMM
+
+.Lghash_mul_one_vec_unreduced$local_label_suffix:
+ vpclmulqdq \$0x00, $TMP0, $AESDATA0, $TMP2
+ vpxor $TMP2, $LO, $LO
+ vpclmulqdq \$0x01, $TMP0, $AESDATA0, $TMP2
+ vpxor $TMP2, $MI, $MI
+ vpclmulqdq \$0x10, $TMP0, $AESDATA0, $TMP2
+ vpxor $TMP2, $MI, $MI
+ vpclmulqdq \$0x11, $TMP0, $AESDATA0, $TMP2
+ vpxor $TMP2, $HI, $HI
+
+.Lreduce$local_label_suffix:
+ # Finally, do the GHASH reduction.
+ vbroadcasti128 .Lgfpoly(%rip), $TMP0
+ vpclmulqdq \$0x01, $LO, $TMP0, $TMP1
+ vpshufd \$0x4e, $LO, $LO
+ vpxor $LO, $MI, $MI
+ vpxor $TMP1, $MI, $MI
+ vpclmulqdq \$0x01, $MI, $TMP0, $TMP1
+ vpshufd \$0x4e, $MI, $MI
+ vpxor $MI, $HI, $HI
+ vpxor $TMP1, $HI, $HI
+ vextracti128 \$1, $HI, $GHASH_ACC_XMM
+ vpxor $HI_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+
+.Ldone$local_label_suffix:
+ # Store the updated GHASH accumulator back to memory.
+ vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
+ vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
+
+ vzeroupper
+___
+ return $code;
+}
+
+$code .= _begin_func "aes_gcm_enc_update_vaes_avx2", 1;
+$code .= _aes_gcm_update 1;
+$code .= _end_func;
+
+$code .= _begin_func "aes_gcm_dec_update_vaes_avx2", 1;
+$code .= _aes_gcm_update 0;
+$code .= _end_func;
+
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
+exit 0;
diff --git a/crypto/fipsmodule/modes/gcm.cc.inc b/crypto/fipsmodule/modes/gcm.cc.inc
index d3c829a..e77c525 100644
--- a/crypto/fipsmodule/modes/gcm.cc.inc
+++ b/crypto/fipsmodule/modes/gcm.cc.inc
@@ -99,6 +99,11 @@
uint8_t Xi[16], const u128 Htable[16],
enum gcm_impl_t impl) {
switch (impl) {
+ case gcm_x86_vaes_avx2:
+ len &= kSizeTWithoutLower4Bits;
+ aes_gcm_enc_update_vaes_avx2(in, out, len, key, ivec, Htable, Xi);
+ CRYPTO_store_u32_be(&ivec[12], CRYPTO_load_u32_be(&ivec[12]) + len / 16);
+ return len;
case gcm_x86_vaes_avx10_256:
len &= kSizeTWithoutLower4Bits;
aes_gcm_enc_update_vaes_avx10_256(in, out, len, key, ivec, Htable, Xi);
@@ -119,6 +124,11 @@
uint8_t Xi[16], const u128 Htable[16],
enum gcm_impl_t impl) {
switch (impl) {
+ case gcm_x86_vaes_avx2:
+ len &= kSizeTWithoutLower4Bits;
+ aes_gcm_dec_update_vaes_avx2(in, out, len, key, ivec, Htable, Xi);
+ CRYPTO_store_u32_be(&ivec[12], CRYPTO_load_u32_be(&ivec[12]) + len / 16);
+ return len;
case gcm_x86_vaes_avx10_256:
len &= kSizeTWithoutLower4Bits;
aes_gcm_dec_update_vaes_avx10_256(in, out, len, key, ivec, Htable, Xi);
@@ -171,15 +181,21 @@
#if defined(GHASH_ASM_X86_64)
if (crypto_gcm_clmul_enabled()) {
- if (CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() &&
- CRYPTO_is_VPCLMULQDQ_capable() && CRYPTO_is_BMI2_capable()) {
- gcm_init_vpclmulqdq_avx10(out_table, H);
- *out_mult = gcm_gmult_vpclmulqdq_avx10;
- if (CRYPTO_cpu_avoid_zmm_registers()) {
- *out_hash = gcm_ghash_vpclmulqdq_avx10_256;
- } else {
- *out_hash = gcm_ghash_vpclmulqdq_avx10_512;
+ if (CRYPTO_is_VPCLMULQDQ_capable() && CRYPTO_is_AVX2_capable()) {
+ if (CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() &&
+ CRYPTO_is_BMI2_capable()) {
+ gcm_init_vpclmulqdq_avx10(out_table, H);
+ *out_mult = gcm_gmult_vpclmulqdq_avx10;
+ if (CRYPTO_cpu_avoid_zmm_registers()) {
+ *out_hash = gcm_ghash_vpclmulqdq_avx10_256;
+ } else {
+ *out_hash = gcm_ghash_vpclmulqdq_avx10_512;
+ }
+ return;
}
+ gcm_init_vpclmulqdq_avx2(out_table, H);
+ *out_mult = gcm_gmult_vpclmulqdq_avx2;
+ *out_hash = gcm_ghash_vpclmulqdq_avx2;
return;
}
if (CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable()) {
@@ -265,6 +281,9 @@
} else if (gcm_key->ghash == gcm_ghash_vpclmulqdq_avx10_512 &&
CRYPTO_is_VAES_capable()) {
gcm_key->impl = gcm_x86_vaes_avx10_512;
+ } else if (gcm_key->ghash == gcm_ghash_vpclmulqdq_avx2 &&
+ CRYPTO_is_VAES_capable()) {
+ gcm_key->impl = gcm_x86_vaes_avx2;
} else if (gcm_key->ghash == gcm_ghash_avx && is_hwaes) {
gcm_key->impl = gcm_x86_aesni;
}
diff --git a/crypto/fipsmodule/modes/gcm_test.cc b/crypto/fipsmodule/modes/gcm_test.cc
index fafde9c..d195526 100644
--- a/crypto/fipsmodule/modes/gcm_test.cc
+++ b/crypto/fipsmodule/modes/gcm_test.cc
@@ -82,6 +82,29 @@
}
}
if (CRYPTO_is_VAES_capable() && CRYPTO_is_VPCLMULQDQ_capable() &&
+ CRYPTO_is_AVX2_capable()) {
+ AES_KEY aes_key;
+ static const uint8_t kKey[16] = {0};
+ uint8_t iv[16] = {0};
+
+ CHECK_ABI_SEH(gcm_init_vpclmulqdq_avx2, Htable, kH);
+ CHECK_ABI_SEH(gcm_gmult_vpclmulqdq_avx2, X, Htable);
+ for (size_t blocks : kBlockCounts) {
+ CHECK_ABI_SEH(gcm_ghash_vpclmulqdq_avx2, X, Htable, buf, 16 * blocks);
+ }
+
+ aes_hw_set_encrypt_key(kKey, 128, &aes_key);
+ for (size_t blocks : kBlockCounts) {
+ CHECK_ABI_SEH(aes_gcm_enc_update_vaes_avx2, buf, buf, blocks * 16,
+ &aes_key, iv, Htable, X);
+ }
+ aes_hw_set_decrypt_key(kKey, 128, &aes_key);
+ for (size_t blocks : kBlockCounts) {
+ CHECK_ABI_SEH(aes_gcm_dec_update_vaes_avx2, buf, buf, blocks * 16,
+ &aes_key, iv, Htable, X);
+ }
+ }
+ if (CRYPTO_is_VAES_capable() && CRYPTO_is_VPCLMULQDQ_capable() &&
CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() &&
CRYPTO_is_BMI2_capable()) {
AES_KEY aes_key;
diff --git a/crypto/fipsmodule/modes/internal.h b/crypto/fipsmodule/modes/internal.h
index a1f7bf5..f041bf8 100644
--- a/crypto/fipsmodule/modes/internal.h
+++ b/crypto/fipsmodule/modes/internal.h
@@ -69,6 +69,7 @@
enum gcm_impl_t {
gcm_separate = 0, // No combined AES-GCM, but may have AES-CTR and GHASH.
gcm_x86_aesni,
+ gcm_x86_vaes_avx2,
gcm_x86_vaes_avx10_256,
gcm_x86_vaes_avx10_512,
gcm_arm64_aes,
@@ -200,6 +201,17 @@
const AES_KEY *key, uint8_t ivec[16],
const u128 Htable[16], uint8_t Xi[16]);
+void gcm_init_vpclmulqdq_avx2(u128 Htable[16], const uint64_t H[2]);
+void gcm_gmult_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16]);
+void gcm_ghash_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16],
+ const uint8_t *in, size_t len);
+void aes_gcm_enc_update_vaes_avx2(const uint8_t *in, uint8_t *out, size_t len,
+ const AES_KEY *key, const uint8_t ivec[16],
+ const u128 Htable[16], uint8_t Xi[16]);
+void aes_gcm_dec_update_vaes_avx2(const uint8_t *in, uint8_t *out, size_t len,
+ const AES_KEY *key, const uint8_t ivec[16],
+ const u128 Htable[16], uint8_t Xi[16]);
+
void gcm_init_vpclmulqdq_avx10(u128 Htable[16], const uint64_t H[2]);
void gcm_gmult_vpclmulqdq_avx10(uint8_t Xi[16], const u128 Htable[16]);
void gcm_ghash_vpclmulqdq_avx10_256(uint8_t Xi[16], const u128 Htable[16],
diff --git a/crypto/impl_dispatch_test.cc b/crypto/impl_dispatch_test.cc
index 8c8d1d1..bfd0045 100644
--- a/crypto/impl_dispatch_test.cc
+++ b/crypto/impl_dispatch_test.cc
@@ -37,8 +37,9 @@
avx_movbe_ = CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable();
ssse3_ = CRYPTO_is_SSSE3_capable();
vaes_ = CRYPTO_is_VAES_capable() && CRYPTO_is_VPCLMULQDQ_capable() &&
- CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() &&
- CRYPTO_is_BMI2_capable();
+ CRYPTO_is_AVX2_capable();
+ avx10_ = CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() &&
+ CRYPTO_is_BMI2_capable();
avoid_zmm_ = CRYPTO_cpu_avoid_zmm_registers();
is_x86_64_ =
#if defined(OPENSSL_X86_64)
@@ -80,6 +81,7 @@
bool ssse3_ = false;
bool is_x86_64_ = false;
bool vaes_ = false;
+ bool avx10_ = false;
bool avoid_zmm_ = false;
#endif
};
@@ -95,6 +97,7 @@
constexpr size_t kFlag_vpaes_set_encrypt_key = 5;
constexpr size_t kFlag_aes_gcm_enc_update_vaes_avx10_256 = 6;
constexpr size_t kFlag_aes_gcm_enc_update_vaes_avx10_512 = 7;
+constexpr size_t kFlag_aes_gcm_enc_update_vaes_avx2 = 8;
TEST_F(ImplDispatchTest, AEAD_AES_GCM) {
AssertFunctionsHit(
@@ -107,9 +110,10 @@
{kFlag_vpaes_encrypt, ssse3_ && !aesni_},
{kFlag_vpaes_set_encrypt_key, ssse3_ && !aesni_},
{kFlag_aes_gcm_enc_update_vaes_avx10_256,
- is_x86_64_ && vaes_ && avoid_zmm_},
+ is_x86_64_ && vaes_ && avx10_ && avoid_zmm_},
{kFlag_aes_gcm_enc_update_vaes_avx10_512,
- is_x86_64_ && vaes_ && !avoid_zmm_},
+ is_x86_64_ && vaes_ && avx10_ && !avoid_zmm_},
+ {kFlag_aes_gcm_enc_update_vaes_avx2, is_x86_64_ && vaes_ && !avx10_},
},
[] {
const uint8_t kZeros[16] = {0};
diff --git a/crypto/internal.h b/crypto/internal.h
index d50e755..62273c6 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -1410,7 +1410,8 @@
// 5: vpaes_set_encrypt_key
// 6: aes_gcm_enc_update_vaes_avx10_256
// 7: aes_gcm_enc_update_vaes_avx10_512
-extern uint8_t BORINGSSL_function_hit[8];
+// 8: aes_gcm_enc_update_vaes_avx2
+extern uint8_t BORINGSSL_function_hit[9];
#endif // BORINGSSL_DISPATCH_TEST
// OPENSSL_vasprintf_internal is just like |vasprintf(3)|. If |system_malloc| is
diff --git a/gen/bcm/aes-gcm-avx2-x86_64-apple.S b/gen/bcm/aes-gcm-avx2-x86_64-apple.S
new file mode 100644
index 0000000..e401e66
--- /dev/null
+++ b/gen/bcm/aes-gcm-avx2-x86_64-apple.S
@@ -0,0 +1,1309 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.section __DATA,__const
+.p2align 4
+
+
+L$bswap_mask:
+.quad 0x08090a0b0c0d0e0f, 0x0001020304050607
+
+
+
+
+
+
+
+
+L$gfpoly:
+.quad 1, 0xc200000000000000
+
+
+L$gfpoly_and_internal_carrybit:
+.quad 1, 0xc200000000000001
+
+.p2align 5
+
+L$ctr_pattern:
+.quad 0, 0
+.quad 1, 0
+L$inc_2blocks:
+.quad 2, 0
+.quad 2, 0
+
+.text
+.globl _gcm_init_vpclmulqdq_avx2
+.private_extern _gcm_init_vpclmulqdq_avx2
+
+.p2align 5
+_gcm_init_vpclmulqdq_avx2:
+
+
+_CET_ENDBR
+
+
+
+
+
+ vpshufd $0x4e,(%rsi),%xmm3
+
+
+
+
+
+ vpshufd $0xd3,%xmm3,%xmm0
+ vpsrad $31,%xmm0,%xmm0
+ vpaddq %xmm3,%xmm3,%xmm3
+ vpand L$gfpoly_and_internal_carrybit(%rip),%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vbroadcasti128 L$gfpoly(%rip),%ymm6
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm3,%xmm0
+ vpclmulqdq $0x01,%xmm3,%xmm3,%xmm1
+ vpclmulqdq $0x10,%xmm3,%xmm3,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vpclmulqdq $0x01,%xmm0,%xmm6,%xmm2
+ vpshufd $0x4e,%xmm0,%xmm0
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm3,%xmm3,%xmm5
+ vpclmulqdq $0x01,%xmm1,%xmm6,%xmm0
+ vpshufd $0x4e,%xmm1,%xmm1
+ vpxor %xmm1,%xmm5,%xmm5
+ vpxor %xmm0,%xmm5,%xmm5
+
+
+
+ vinserti128 $1,%xmm3,%ymm5,%ymm3
+ vinserti128 $1,%xmm5,%ymm5,%ymm5
+
+
+ vpclmulqdq $0x00,%ymm5,%ymm3,%ymm0
+ vpclmulqdq $0x01,%ymm5,%ymm3,%ymm1
+ vpclmulqdq $0x10,%ymm5,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2
+ vpshufd $0x4e,%ymm0,%ymm0
+ vpxor %ymm0,%ymm1,%ymm1
+ vpxor %ymm2,%ymm1,%ymm1
+ vpclmulqdq $0x11,%ymm5,%ymm3,%ymm4
+ vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0
+ vpshufd $0x4e,%ymm1,%ymm1
+ vpxor %ymm1,%ymm4,%ymm4
+ vpxor %ymm0,%ymm4,%ymm4
+
+
+
+ vmovdqu %ymm3,96(%rdi)
+ vmovdqu %ymm4,64(%rdi)
+
+
+
+ vpunpcklqdq %ymm3,%ymm4,%ymm0
+ vpunpckhqdq %ymm3,%ymm4,%ymm1
+ vpxor %ymm1,%ymm0,%ymm0
+ vmovdqu %ymm0,128+32(%rdi)
+
+
+ vpclmulqdq $0x00,%ymm5,%ymm4,%ymm0
+ vpclmulqdq $0x01,%ymm5,%ymm4,%ymm1
+ vpclmulqdq $0x10,%ymm5,%ymm4,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2
+ vpshufd $0x4e,%ymm0,%ymm0
+ vpxor %ymm0,%ymm1,%ymm1
+ vpxor %ymm2,%ymm1,%ymm1
+ vpclmulqdq $0x11,%ymm5,%ymm4,%ymm3
+ vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0
+ vpshufd $0x4e,%ymm1,%ymm1
+ vpxor %ymm1,%ymm3,%ymm3
+ vpxor %ymm0,%ymm3,%ymm3
+
+ vpclmulqdq $0x00,%ymm5,%ymm3,%ymm0
+ vpclmulqdq $0x01,%ymm5,%ymm3,%ymm1
+ vpclmulqdq $0x10,%ymm5,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2
+ vpshufd $0x4e,%ymm0,%ymm0
+ vpxor %ymm0,%ymm1,%ymm1
+ vpxor %ymm2,%ymm1,%ymm1
+ vpclmulqdq $0x11,%ymm5,%ymm3,%ymm4
+ vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0
+ vpshufd $0x4e,%ymm1,%ymm1
+ vpxor %ymm1,%ymm4,%ymm4
+ vpxor %ymm0,%ymm4,%ymm4
+
+ vmovdqu %ymm3,32(%rdi)
+ vmovdqu %ymm4,0(%rdi)
+
+
+
+ vpunpcklqdq %ymm3,%ymm4,%ymm0
+ vpunpckhqdq %ymm3,%ymm4,%ymm1
+ vpxor %ymm1,%ymm0,%ymm0
+ vmovdqu %ymm0,128(%rdi)
+
+ vzeroupper
+ ret
+
+
+
+.globl _gcm_gmult_vpclmulqdq_avx2
+.private_extern _gcm_gmult_vpclmulqdq_avx2
+
+.p2align 5
+_gcm_gmult_vpclmulqdq_avx2:
+
+
+_CET_ENDBR
+
+
+
+ vmovdqu (%rdi),%xmm0
+ vmovdqu L$bswap_mask(%rip),%xmm1
+ vmovdqu 128-16(%rsi),%xmm2
+ vmovdqu L$gfpoly(%rip),%xmm3
+ vpshufb %xmm1,%xmm0,%xmm0
+
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm4
+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm5
+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x01,%xmm4,%xmm3,%xmm6
+ vpshufd $0x4e,%xmm4,%xmm4
+ vpxor %xmm4,%xmm5,%xmm5
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x01,%xmm5,%xmm3,%xmm4
+ vpshufd $0x4e,%xmm5,%xmm5
+ vpxor %xmm5,%xmm0,%xmm0
+ vpxor %xmm4,%xmm0,%xmm0
+
+
+ vpshufb %xmm1,%xmm0,%xmm0
+ vmovdqu %xmm0,(%rdi)
+ ret
+
+
+
+.globl _gcm_ghash_vpclmulqdq_avx2
+.private_extern _gcm_ghash_vpclmulqdq_avx2
+
+.p2align 5
+_gcm_ghash_vpclmulqdq_avx2:
+
+
+_CET_ENDBR
+
+
+
+ vbroadcasti128 L$bswap_mask(%rip),%ymm6
+ vmovdqu (%rdi),%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vbroadcasti128 L$gfpoly(%rip),%ymm7
+
+
+ cmpq $32,%rcx
+ jb L$ghash_lastblock
+
+ cmpq $127,%rcx
+ jbe L$ghash_loop_1x
+
+
+ vmovdqu 128(%rsi),%ymm8
+ vmovdqu 128+32(%rsi),%ymm9
+L$ghash_loop_4x:
+
+ vmovdqu 0(%rdx),%ymm1
+ vpshufb %ymm6,%ymm1,%ymm1
+ vmovdqu 0(%rsi),%ymm2
+ vpxor %ymm5,%ymm1,%ymm1
+ vpclmulqdq $0x00,%ymm2,%ymm1,%ymm3
+ vpclmulqdq $0x11,%ymm2,%ymm1,%ymm5
+ vpunpckhqdq %ymm1,%ymm1,%ymm0
+ vpxor %ymm1,%ymm0,%ymm0
+ vpclmulqdq $0x00,%ymm8,%ymm0,%ymm4
+
+ vmovdqu 32(%rdx),%ymm1
+ vpshufb %ymm6,%ymm1,%ymm1
+ vmovdqu 32(%rsi),%ymm2
+ vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0
+ vpxor %ymm0,%ymm5,%ymm5
+ vpunpckhqdq %ymm1,%ymm1,%ymm0
+ vpxor %ymm1,%ymm0,%ymm0
+ vpclmulqdq $0x10,%ymm8,%ymm0,%ymm0
+ vpxor %ymm0,%ymm4,%ymm4
+
+ vmovdqu 64(%rdx),%ymm1
+ vpshufb %ymm6,%ymm1,%ymm1
+ vmovdqu 64(%rsi),%ymm2
+ vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0
+ vpxor %ymm0,%ymm5,%ymm5
+ vpunpckhqdq %ymm1,%ymm1,%ymm0
+ vpxor %ymm1,%ymm0,%ymm0
+ vpclmulqdq $0x00,%ymm9,%ymm0,%ymm0
+ vpxor %ymm0,%ymm4,%ymm4
+
+
+ vmovdqu 96(%rdx),%ymm1
+ vpshufb %ymm6,%ymm1,%ymm1
+ vmovdqu 96(%rsi),%ymm2
+ vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0
+ vpxor %ymm0,%ymm5,%ymm5
+ vpunpckhqdq %ymm1,%ymm1,%ymm0
+ vpxor %ymm1,%ymm0,%ymm0
+ vpclmulqdq $0x10,%ymm9,%ymm0,%ymm0
+ vpxor %ymm0,%ymm4,%ymm4
+
+ vpxor %ymm3,%ymm4,%ymm4
+ vpxor %ymm5,%ymm4,%ymm4
+
+
+ vbroadcasti128 L$gfpoly(%rip),%ymm2
+ vpclmulqdq $0x01,%ymm3,%ymm2,%ymm0
+ vpshufd $0x4e,%ymm3,%ymm3
+ vpxor %ymm3,%ymm4,%ymm4
+ vpxor %ymm0,%ymm4,%ymm4
+
+ vpclmulqdq $0x01,%ymm4,%ymm2,%ymm0
+ vpshufd $0x4e,%ymm4,%ymm4
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm0,%ymm5,%ymm5
+ vextracti128 $1,%ymm5,%xmm0
+ vpxor %xmm0,%xmm5,%xmm5
+
+ subq $-128,%rdx
+ addq $-128,%rcx
+ cmpq $127,%rcx
+ ja L$ghash_loop_4x
+
+
+ cmpq $32,%rcx
+ jb L$ghash_loop_1x_done
+L$ghash_loop_1x:
+ vmovdqu (%rdx),%ymm0
+ vpshufb %ymm6,%ymm0,%ymm0
+ vpxor %ymm0,%ymm5,%ymm5
+ vmovdqu 128-32(%rsi),%ymm0
+ vpclmulqdq $0x00,%ymm0,%ymm5,%ymm1
+ vpclmulqdq $0x01,%ymm0,%ymm5,%ymm2
+ vpclmulqdq $0x10,%ymm0,%ymm5,%ymm3
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x01,%ymm1,%ymm7,%ymm3
+ vpshufd $0x4e,%ymm1,%ymm1
+ vpxor %ymm1,%ymm2,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x11,%ymm0,%ymm5,%ymm5
+ vpclmulqdq $0x01,%ymm2,%ymm7,%ymm1
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm1,%ymm5,%ymm5
+
+ vextracti128 $1,%ymm5,%xmm0
+ vpxor %xmm0,%xmm5,%xmm5
+ addq $32,%rdx
+ subq $32,%rcx
+ cmpq $32,%rcx
+ jae L$ghash_loop_1x
+L$ghash_loop_1x_done:
+
+
+ vzeroupper
+
+
+L$ghash_lastblock:
+ testq %rcx,%rcx
+ jz L$ghash_done
+ vmovdqu (%rdx),%xmm0
+ vpshufb %xmm6,%xmm0,%xmm0
+ vpxor %xmm0,%xmm5,%xmm5
+ vmovdqu 128-16(%rsi),%xmm0
+ vpclmulqdq $0x00,%xmm0,%xmm5,%xmm1
+ vpclmulqdq $0x01,%xmm0,%xmm5,%xmm2
+ vpclmulqdq $0x10,%xmm0,%xmm5,%xmm3
+ vpxor %xmm3,%xmm2,%xmm2
+ vpclmulqdq $0x01,%xmm1,%xmm7,%xmm3
+ vpshufd $0x4e,%xmm1,%xmm1
+ vpxor %xmm1,%xmm2,%xmm2
+ vpxor %xmm3,%xmm2,%xmm2
+ vpclmulqdq $0x11,%xmm0,%xmm5,%xmm5
+ vpclmulqdq $0x01,%xmm2,%xmm7,%xmm1
+ vpshufd $0x4e,%xmm2,%xmm2
+ vpxor %xmm2,%xmm5,%xmm5
+ vpxor %xmm1,%xmm5,%xmm5
+
+
+L$ghash_done:
+
+ vpshufb %xmm6,%xmm5,%xmm5
+ vmovdqu %xmm5,(%rdi)
+ ret
+
+
+
+.globl _aes_gcm_enc_update_vaes_avx2
+.private_extern _aes_gcm_enc_update_vaes_avx2
+
+.p2align 5
+_aes_gcm_enc_update_vaes_avx2:
+
+
+_CET_ENDBR
+ pushq %r12
+
+
+ movq 16(%rsp),%r12
+#ifdef BORINGSSL_DISPATCH_TEST
+
+ movb $1,_BORINGSSL_function_hit+8(%rip)
+#endif
+ vbroadcasti128 L$bswap_mask(%rip),%ymm0
+
+
+
+ vmovdqu (%r12),%xmm1
+ vpshufb %xmm0,%xmm1,%xmm1
+ vbroadcasti128 (%r8),%ymm11
+ vpshufb %ymm0,%ymm11,%ymm11
+
+
+
+ movl 240(%rcx),%r10d
+ leal -20(,%r10,4),%r10d
+
+
+
+
+ leaq 96(%rcx,%r10,4),%r11
+ vbroadcasti128 (%rcx),%ymm9
+ vbroadcasti128 (%r11),%ymm10
+
+
+ vpaddd L$ctr_pattern(%rip),%ymm11,%ymm11
+
+
+
+ cmpq $127,%rdx
+ jbe L$crypt_loop_4x_done__func1
+
+ vmovdqu 128(%r9),%ymm7
+ vmovdqu 128+32(%r9),%ymm8
+
+
+
+ vmovdqu L$inc_2blocks(%rip),%ymm2
+ vpshufb %ymm0,%ymm11,%ymm12
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm13
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm14
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm15
+ vpaddd %ymm2,%ymm11,%ymm11
+
+
+ vpxor %ymm9,%ymm12,%ymm12
+ vpxor %ymm9,%ymm13,%ymm13
+ vpxor %ymm9,%ymm14,%ymm14
+ vpxor %ymm9,%ymm15,%ymm15
+
+ leaq 16(%rcx),%rax
+L$vaesenc_loop_first_4_vecs__func1:
+ vbroadcasti128 (%rax),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne L$vaesenc_loop_first_4_vecs__func1
+ vpxor 0(%rdi),%ymm10,%ymm2
+ vpxor 32(%rdi),%ymm10,%ymm3
+ vpxor 64(%rdi),%ymm10,%ymm5
+ vpxor 96(%rdi),%ymm10,%ymm6
+ vaesenclast %ymm2,%ymm12,%ymm12
+ vaesenclast %ymm3,%ymm13,%ymm13
+ vaesenclast %ymm5,%ymm14,%ymm14
+ vaesenclast %ymm6,%ymm15,%ymm15
+ vmovdqu %ymm12,0(%rsi)
+ vmovdqu %ymm13,32(%rsi)
+ vmovdqu %ymm14,64(%rsi)
+ vmovdqu %ymm15,96(%rsi)
+
+ subq $-128,%rdi
+ addq $-128,%rdx
+ cmpq $127,%rdx
+ jbe L$ghash_last_ciphertext_4x__func1
+.p2align 4
+L$crypt_loop_4x__func1:
+
+
+
+
+ vmovdqu L$inc_2blocks(%rip),%ymm2
+ vpshufb %ymm0,%ymm11,%ymm12
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm13
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm14
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm15
+ vpaddd %ymm2,%ymm11,%ymm11
+
+
+ vpxor %ymm9,%ymm12,%ymm12
+ vpxor %ymm9,%ymm13,%ymm13
+ vpxor %ymm9,%ymm14,%ymm14
+ vpxor %ymm9,%ymm15,%ymm15
+
+ cmpl $24,%r10d
+ jl L$aes128__func1
+ je L$aes192__func1
+
+ vbroadcasti128 -208(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vbroadcasti128 -192(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+L$aes192__func1:
+ vbroadcasti128 -176(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vbroadcasti128 -160(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+L$aes128__func1:
+
+ vmovdqu 0(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 0(%r9),%ymm4
+ vpxor %ymm1,%ymm3,%ymm3
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6
+
+ vbroadcasti128 -144(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vbroadcasti128 -128(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vmovdqu 32(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 32(%r9),%ymm4
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vbroadcasti128 -112(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vmovdqu 64(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 64(%r9),%ymm4
+
+ vbroadcasti128 -96(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+
+ vbroadcasti128 -80(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+
+ vmovdqu 96(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+
+ vbroadcasti128 -64(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vmovdqu 96(%r9),%ymm4
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vbroadcasti128 -48(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm1,%ymm6,%ymm6
+
+
+ vbroadcasti128 L$gfpoly(%rip),%ymm4
+ vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2
+ vpshufd $0x4e,%ymm5,%ymm5
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vbroadcasti128 -32(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ vpxor %ymm6,%ymm1,%ymm1
+ vpxor %ymm2,%ymm1,%ymm1
+
+ vbroadcasti128 -16(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vextracti128 $1,%ymm1,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+
+
+ subq $-128,%rsi
+ vpxor 0(%rdi),%ymm10,%ymm2
+ vpxor 32(%rdi),%ymm10,%ymm3
+ vpxor 64(%rdi),%ymm10,%ymm5
+ vpxor 96(%rdi),%ymm10,%ymm6
+ vaesenclast %ymm2,%ymm12,%ymm12
+ vaesenclast %ymm3,%ymm13,%ymm13
+ vaesenclast %ymm5,%ymm14,%ymm14
+ vaesenclast %ymm6,%ymm15,%ymm15
+ vmovdqu %ymm12,0(%rsi)
+ vmovdqu %ymm13,32(%rsi)
+ vmovdqu %ymm14,64(%rsi)
+ vmovdqu %ymm15,96(%rsi)
+
+ subq $-128,%rdi
+
+ addq $-128,%rdx
+ cmpq $127,%rdx
+ ja L$crypt_loop_4x__func1
+L$ghash_last_ciphertext_4x__func1:
+
+ vmovdqu 0(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 0(%r9),%ymm4
+ vpxor %ymm1,%ymm3,%ymm3
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6
+
+ vmovdqu 32(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 32(%r9),%ymm4
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vmovdqu 64(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 64(%r9),%ymm4
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+
+ vmovdqu 96(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 96(%r9),%ymm4
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm1,%ymm6,%ymm6
+
+
+ vbroadcasti128 L$gfpoly(%rip),%ymm4
+ vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2
+ vpshufd $0x4e,%ymm5,%ymm5
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ vpxor %ymm6,%ymm1,%ymm1
+ vpxor %ymm2,%ymm1,%ymm1
+ vextracti128 $1,%ymm1,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+
+ subq $-128,%rsi
+L$crypt_loop_4x_done__func1:
+
+ testq %rdx,%rdx
+ jz L$done__func1
+
+
+
+
+
+ leaq 128(%r9),%r8
+ subq %rdx,%r8
+
+
+ vpxor %xmm5,%xmm5,%xmm5
+ vpxor %xmm6,%xmm6,%xmm6
+ vpxor %xmm7,%xmm7,%xmm7
+
+ cmpq $64,%rdx
+ jb L$lessthan64bytes__func1
+
+
+ vpshufb %ymm0,%ymm11,%ymm12
+ vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm13
+ vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11
+ vpxor %ymm9,%ymm12,%ymm12
+ vpxor %ymm9,%ymm13,%ymm13
+ leaq 16(%rcx),%rax
+L$vaesenc_loop_tail_1__func1:
+ vbroadcasti128 (%rax),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne L$vaesenc_loop_tail_1__func1
+ vaesenclast %ymm10,%ymm12,%ymm12
+ vaesenclast %ymm10,%ymm13,%ymm13
+
+
+ vmovdqu 0(%rdi),%ymm2
+ vmovdqu 32(%rdi),%ymm3
+ vpxor %ymm2,%ymm12,%ymm12
+ vpxor %ymm3,%ymm13,%ymm13
+ vmovdqu %ymm12,0(%rsi)
+ vmovdqu %ymm13,32(%rsi)
+
+
+ vpshufb %ymm0,%ymm12,%ymm12
+ vpshufb %ymm0,%ymm13,%ymm13
+ vpxor %ymm1,%ymm12,%ymm12
+ vmovdqu (%r8),%ymm2
+ vmovdqu 32(%r8),%ymm3
+ vpclmulqdq $0x00,%ymm2,%ymm12,%ymm5
+ vpclmulqdq $0x01,%ymm2,%ymm12,%ymm6
+ vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%ymm2,%ymm12,%ymm7
+ vpclmulqdq $0x00,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm5,%ymm5
+ vpclmulqdq $0x01,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x10,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+
+ addq $64,%r8
+ addq $64,%rdi
+ addq $64,%rsi
+ subq $64,%rdx
+ jz L$reduce__func1
+
+ vpxor %xmm1,%xmm1,%xmm1
+
+
+L$lessthan64bytes__func1:
+ vpshufb %ymm0,%ymm11,%ymm12
+ vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm13
+ vpxor %ymm9,%ymm12,%ymm12
+ vpxor %ymm9,%ymm13,%ymm13
+ leaq 16(%rcx),%rax
+L$vaesenc_loop_tail_2__func1:
+ vbroadcasti128 (%rax),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne L$vaesenc_loop_tail_2__func1
+ vaesenclast %ymm10,%ymm12,%ymm12
+ vaesenclast %ymm10,%ymm13,%ymm13
+
+
+
+
+ cmpq $32,%rdx
+ jb L$xor_one_block__func1
+ je L$xor_two_blocks__func1
+
+L$xor_three_blocks__func1:
+ vmovdqu 0(%rdi),%ymm2
+ vmovdqu 32(%rdi),%xmm3
+ vpxor %ymm2,%ymm12,%ymm12
+ vpxor %xmm3,%xmm13,%xmm13
+ vmovdqu %ymm12,0(%rsi)
+ vmovdqu %xmm13,32(%rsi)
+
+ vpshufb %ymm0,%ymm12,%ymm12
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpxor %ymm1,%ymm12,%ymm12
+ vmovdqu (%r8),%ymm2
+ vmovdqu 32(%r8),%xmm3
+ vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm5,%ymm5
+ vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm7,%ymm7
+ jmp L$ghash_mul_one_vec_unreduced__func1
+
+L$xor_two_blocks__func1:
+ vmovdqu (%rdi),%ymm2
+ vpxor %ymm2,%ymm12,%ymm12
+ vmovdqu %ymm12,(%rsi)
+ vpshufb %ymm0,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm12
+ vmovdqu (%r8),%ymm2
+ jmp L$ghash_mul_one_vec_unreduced__func1
+
+L$xor_one_block__func1:
+ vmovdqu (%rdi),%xmm2
+ vpxor %xmm2,%xmm12,%xmm12
+ vmovdqu %xmm12,(%rsi)
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm1,%xmm12,%xmm12
+ vmovdqu (%r8),%xmm2
+
+L$ghash_mul_one_vec_unreduced__func1:
+ vpclmulqdq $0x00,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm5,%ymm5
+ vpclmulqdq $0x01,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+
+L$reduce__func1:
+
+ vbroadcasti128 L$gfpoly(%rip),%ymm2
+ vpclmulqdq $0x01,%ymm5,%ymm2,%ymm3
+ vpshufd $0x4e,%ymm5,%ymm5
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpclmulqdq $0x01,%ymm6,%ymm2,%ymm3
+ vpshufd $0x4e,%ymm6,%ymm6
+ vpxor %ymm6,%ymm7,%ymm7
+ vpxor %ymm3,%ymm7,%ymm7
+ vextracti128 $1,%ymm7,%xmm1
+ vpxor %xmm7,%xmm1,%xmm1
+
+L$done__func1:
+
+ vpshufb %xmm0,%xmm1,%xmm1
+ vmovdqu %xmm1,(%r12)
+
+ vzeroupper
+ popq %r12
+
+ ret
+
+
+
+.globl _aes_gcm_dec_update_vaes_avx2
+.private_extern _aes_gcm_dec_update_vaes_avx2
+
+.p2align 5
+_aes_gcm_dec_update_vaes_avx2:
+
+
+_CET_ENDBR
+ pushq %r12
+
+
+ movq 16(%rsp),%r12
+ vbroadcasti128 L$bswap_mask(%rip),%ymm0
+
+
+
+ vmovdqu (%r12),%xmm1
+ vpshufb %xmm0,%xmm1,%xmm1
+ vbroadcasti128 (%r8),%ymm11
+ vpshufb %ymm0,%ymm11,%ymm11
+
+
+
+ movl 240(%rcx),%r10d
+ leal -20(,%r10,4),%r10d
+
+
+
+
+ leaq 96(%rcx,%r10,4),%r11
+ vbroadcasti128 (%rcx),%ymm9
+ vbroadcasti128 (%r11),%ymm10
+
+
+ vpaddd L$ctr_pattern(%rip),%ymm11,%ymm11
+
+
+
+ cmpq $127,%rdx
+ jbe L$crypt_loop_4x_done__func2
+
+ vmovdqu 128(%r9),%ymm7
+ vmovdqu 128+32(%r9),%ymm8
+.p2align 4
+L$crypt_loop_4x__func2:
+
+
+
+
+ vmovdqu L$inc_2blocks(%rip),%ymm2
+ vpshufb %ymm0,%ymm11,%ymm12
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm13
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm14
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm15
+ vpaddd %ymm2,%ymm11,%ymm11
+
+
+ vpxor %ymm9,%ymm12,%ymm12
+ vpxor %ymm9,%ymm13,%ymm13
+ vpxor %ymm9,%ymm14,%ymm14
+ vpxor %ymm9,%ymm15,%ymm15
+
+ cmpl $24,%r10d
+ jl L$aes128__func2
+ je L$aes192__func2
+
+ vbroadcasti128 -208(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vbroadcasti128 -192(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+L$aes192__func2:
+ vbroadcasti128 -176(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vbroadcasti128 -160(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+L$aes128__func2:
+
+ vmovdqu 0(%rdi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 0(%r9),%ymm4
+ vpxor %ymm1,%ymm3,%ymm3
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6
+
+ vbroadcasti128 -144(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vbroadcasti128 -128(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vmovdqu 32(%rdi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 32(%r9),%ymm4
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vbroadcasti128 -112(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vmovdqu 64(%rdi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 64(%r9),%ymm4
+
+ vbroadcasti128 -96(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+
+ vbroadcasti128 -80(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+
+ vmovdqu 96(%rdi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+
+ vbroadcasti128 -64(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vmovdqu 96(%r9),%ymm4
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vbroadcasti128 -48(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm1,%ymm6,%ymm6
+
+
+ vbroadcasti128 L$gfpoly(%rip),%ymm4
+ vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2
+ vpshufd $0x4e,%ymm5,%ymm5
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vbroadcasti128 -32(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ vpxor %ymm6,%ymm1,%ymm1
+ vpxor %ymm2,%ymm1,%ymm1
+
+ vbroadcasti128 -16(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vextracti128 $1,%ymm1,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+
+
+
+ vpxor 0(%rdi),%ymm10,%ymm2
+ vpxor 32(%rdi),%ymm10,%ymm3
+ vpxor 64(%rdi),%ymm10,%ymm5
+ vpxor 96(%rdi),%ymm10,%ymm6
+ vaesenclast %ymm2,%ymm12,%ymm12
+ vaesenclast %ymm3,%ymm13,%ymm13
+ vaesenclast %ymm5,%ymm14,%ymm14
+ vaesenclast %ymm6,%ymm15,%ymm15
+ vmovdqu %ymm12,0(%rsi)
+ vmovdqu %ymm13,32(%rsi)
+ vmovdqu %ymm14,64(%rsi)
+ vmovdqu %ymm15,96(%rsi)
+
+ subq $-128,%rdi
+ subq $-128,%rsi
+ addq $-128,%rdx
+ cmpq $127,%rdx
+ ja L$crypt_loop_4x__func2
+L$crypt_loop_4x_done__func2:
+
+ testq %rdx,%rdx
+ jz L$done__func2
+
+
+
+
+
+ leaq 128(%r9),%r8
+ subq %rdx,%r8
+
+
+ vpxor %xmm5,%xmm5,%xmm5
+ vpxor %xmm6,%xmm6,%xmm6
+ vpxor %xmm7,%xmm7,%xmm7
+
+ cmpq $64,%rdx
+ jb L$lessthan64bytes__func2
+
+
+ vpshufb %ymm0,%ymm11,%ymm12
+ vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm13
+ vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11
+ vpxor %ymm9,%ymm12,%ymm12
+ vpxor %ymm9,%ymm13,%ymm13
+ leaq 16(%rcx),%rax
+L$vaesenc_loop_tail_1__func2:
+ vbroadcasti128 (%rax),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne L$vaesenc_loop_tail_1__func2
+ vaesenclast %ymm10,%ymm12,%ymm12
+ vaesenclast %ymm10,%ymm13,%ymm13
+
+
+ vmovdqu 0(%rdi),%ymm2
+ vmovdqu 32(%rdi),%ymm3
+ vpxor %ymm2,%ymm12,%ymm12
+ vpxor %ymm3,%ymm13,%ymm13
+ vmovdqu %ymm12,0(%rsi)
+ vmovdqu %ymm13,32(%rsi)
+
+
+ vpshufb %ymm0,%ymm2,%ymm12
+ vpshufb %ymm0,%ymm3,%ymm13
+ vpxor %ymm1,%ymm12,%ymm12
+ vmovdqu (%r8),%ymm2
+ vmovdqu 32(%r8),%ymm3
+ vpclmulqdq $0x00,%ymm2,%ymm12,%ymm5
+ vpclmulqdq $0x01,%ymm2,%ymm12,%ymm6
+ vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%ymm2,%ymm12,%ymm7
+ vpclmulqdq $0x00,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm5,%ymm5
+ vpclmulqdq $0x01,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x10,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+
+ addq $64,%r8
+ addq $64,%rdi
+ addq $64,%rsi
+ subq $64,%rdx
+ jz L$reduce__func2
+
+ vpxor %xmm1,%xmm1,%xmm1
+
+
+L$lessthan64bytes__func2:
+ vpshufb %ymm0,%ymm11,%ymm12
+ vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm13
+ vpxor %ymm9,%ymm12,%ymm12
+ vpxor %ymm9,%ymm13,%ymm13
+ leaq 16(%rcx),%rax
+L$vaesenc_loop_tail_2__func2:
+ vbroadcasti128 (%rax),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne L$vaesenc_loop_tail_2__func2
+ vaesenclast %ymm10,%ymm12,%ymm12
+ vaesenclast %ymm10,%ymm13,%ymm13
+
+
+
+
+ cmpq $32,%rdx
+ jb L$xor_one_block__func2
+ je L$xor_two_blocks__func2
+
+L$xor_three_blocks__func2:
+ vmovdqu 0(%rdi),%ymm2
+ vmovdqu 32(%rdi),%xmm3
+ vpxor %ymm2,%ymm12,%ymm12
+ vpxor %xmm3,%xmm13,%xmm13
+ vmovdqu %ymm12,0(%rsi)
+ vmovdqu %xmm13,32(%rsi)
+
+ vpshufb %ymm0,%ymm2,%ymm12
+ vpshufb %xmm0,%xmm3,%xmm13
+ vpxor %ymm1,%ymm12,%ymm12
+ vmovdqu (%r8),%ymm2
+ vmovdqu 32(%r8),%xmm3
+ vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm5,%ymm5
+ vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm7,%ymm7
+ jmp L$ghash_mul_one_vec_unreduced__func2
+
+L$xor_two_blocks__func2:
+ vmovdqu (%rdi),%ymm2
+ vpxor %ymm2,%ymm12,%ymm12
+ vmovdqu %ymm12,(%rsi)
+ vpshufb %ymm0,%ymm2,%ymm12
+ vpxor %ymm1,%ymm12,%ymm12
+ vmovdqu (%r8),%ymm2
+ jmp L$ghash_mul_one_vec_unreduced__func2
+
+L$xor_one_block__func2:
+ vmovdqu (%rdi),%xmm2
+ vpxor %xmm2,%xmm12,%xmm12
+ vmovdqu %xmm12,(%rsi)
+ vpshufb %xmm0,%xmm2,%xmm12
+ vpxor %xmm1,%xmm12,%xmm12
+ vmovdqu (%r8),%xmm2
+
+L$ghash_mul_one_vec_unreduced__func2:
+ vpclmulqdq $0x00,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm5,%ymm5
+ vpclmulqdq $0x01,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+
+L$reduce__func2:
+
+ vbroadcasti128 L$gfpoly(%rip),%ymm2
+ vpclmulqdq $0x01,%ymm5,%ymm2,%ymm3
+ vpshufd $0x4e,%ymm5,%ymm5
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpclmulqdq $0x01,%ymm6,%ymm2,%ymm3
+ vpshufd $0x4e,%ymm6,%ymm6
+ vpxor %ymm6,%ymm7,%ymm7
+ vpxor %ymm3,%ymm7,%ymm7
+ vextracti128 $1,%ymm7,%xmm1
+ vpxor %xmm7,%xmm1,%xmm1
+
+L$done__func2:
+
+ vpshufb %xmm0,%xmm1,%xmm1
+ vmovdqu %xmm1,(%r12)
+
+ vzeroupper
+ popq %r12
+
+ ret
+
+
+
+#endif
diff --git a/gen/bcm/aes-gcm-avx2-x86_64-linux.S b/gen/bcm/aes-gcm-avx2-x86_64-linux.S
new file mode 100644
index 0000000..b7816cf
--- /dev/null
+++ b/gen/bcm/aes-gcm-avx2-x86_64-linux.S
@@ -0,0 +1,1314 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.section .rodata
+.align 16
+
+
+.Lbswap_mask:
+.quad 0x08090a0b0c0d0e0f, 0x0001020304050607
+
+
+
+
+
+
+
+
+.Lgfpoly:
+.quad 1, 0xc200000000000000
+
+
+.Lgfpoly_and_internal_carrybit:
+.quad 1, 0xc200000000000001
+
+.align 32
+
+.Lctr_pattern:
+.quad 0, 0
+.quad 1, 0
+.Linc_2blocks:
+.quad 2, 0
+.quad 2, 0
+
+.text
+.globl gcm_init_vpclmulqdq_avx2
+.hidden gcm_init_vpclmulqdq_avx2
+.type gcm_init_vpclmulqdq_avx2,@function
+.align 32
+gcm_init_vpclmulqdq_avx2:
+.cfi_startproc
+
+_CET_ENDBR
+
+
+
+
+
+ vpshufd $0x4e,(%rsi),%xmm3
+
+
+
+
+
+ vpshufd $0xd3,%xmm3,%xmm0
+ vpsrad $31,%xmm0,%xmm0
+ vpaddq %xmm3,%xmm3,%xmm3
+ vpand .Lgfpoly_and_internal_carrybit(%rip),%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vbroadcasti128 .Lgfpoly(%rip),%ymm6
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm3,%xmm0
+ vpclmulqdq $0x01,%xmm3,%xmm3,%xmm1
+ vpclmulqdq $0x10,%xmm3,%xmm3,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vpclmulqdq $0x01,%xmm0,%xmm6,%xmm2
+ vpshufd $0x4e,%xmm0,%xmm0
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm3,%xmm3,%xmm5
+ vpclmulqdq $0x01,%xmm1,%xmm6,%xmm0
+ vpshufd $0x4e,%xmm1,%xmm1
+ vpxor %xmm1,%xmm5,%xmm5
+ vpxor %xmm0,%xmm5,%xmm5
+
+
+
+ vinserti128 $1,%xmm3,%ymm5,%ymm3
+ vinserti128 $1,%xmm5,%ymm5,%ymm5
+
+
+ vpclmulqdq $0x00,%ymm5,%ymm3,%ymm0
+ vpclmulqdq $0x01,%ymm5,%ymm3,%ymm1
+ vpclmulqdq $0x10,%ymm5,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2
+ vpshufd $0x4e,%ymm0,%ymm0
+ vpxor %ymm0,%ymm1,%ymm1
+ vpxor %ymm2,%ymm1,%ymm1
+ vpclmulqdq $0x11,%ymm5,%ymm3,%ymm4
+ vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0
+ vpshufd $0x4e,%ymm1,%ymm1
+ vpxor %ymm1,%ymm4,%ymm4
+ vpxor %ymm0,%ymm4,%ymm4
+
+
+
+ vmovdqu %ymm3,96(%rdi)
+ vmovdqu %ymm4,64(%rdi)
+
+
+
+ vpunpcklqdq %ymm3,%ymm4,%ymm0
+ vpunpckhqdq %ymm3,%ymm4,%ymm1
+ vpxor %ymm1,%ymm0,%ymm0
+ vmovdqu %ymm0,128+32(%rdi)
+
+
+ vpclmulqdq $0x00,%ymm5,%ymm4,%ymm0
+ vpclmulqdq $0x01,%ymm5,%ymm4,%ymm1
+ vpclmulqdq $0x10,%ymm5,%ymm4,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2
+ vpshufd $0x4e,%ymm0,%ymm0
+ vpxor %ymm0,%ymm1,%ymm1
+ vpxor %ymm2,%ymm1,%ymm1
+ vpclmulqdq $0x11,%ymm5,%ymm4,%ymm3
+ vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0
+ vpshufd $0x4e,%ymm1,%ymm1
+ vpxor %ymm1,%ymm3,%ymm3
+ vpxor %ymm0,%ymm3,%ymm3
+
+ vpclmulqdq $0x00,%ymm5,%ymm3,%ymm0
+ vpclmulqdq $0x01,%ymm5,%ymm3,%ymm1
+ vpclmulqdq $0x10,%ymm5,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2
+ vpshufd $0x4e,%ymm0,%ymm0
+ vpxor %ymm0,%ymm1,%ymm1
+ vpxor %ymm2,%ymm1,%ymm1
+ vpclmulqdq $0x11,%ymm5,%ymm3,%ymm4
+ vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0
+ vpshufd $0x4e,%ymm1,%ymm1
+ vpxor %ymm1,%ymm4,%ymm4
+ vpxor %ymm0,%ymm4,%ymm4
+
+ vmovdqu %ymm3,32(%rdi)
+ vmovdqu %ymm4,0(%rdi)
+
+
+
+ vpunpcklqdq %ymm3,%ymm4,%ymm0
+ vpunpckhqdq %ymm3,%ymm4,%ymm1
+ vpxor %ymm1,%ymm0,%ymm0
+ vmovdqu %ymm0,128(%rdi)
+
+ vzeroupper
+ ret
+
+.cfi_endproc
+.size gcm_init_vpclmulqdq_avx2, . - gcm_init_vpclmulqdq_avx2
+.globl gcm_gmult_vpclmulqdq_avx2
+.hidden gcm_gmult_vpclmulqdq_avx2
+.type gcm_gmult_vpclmulqdq_avx2,@function
+.align 32
+gcm_gmult_vpclmulqdq_avx2:
+.cfi_startproc
+
+_CET_ENDBR
+
+
+
+ vmovdqu (%rdi),%xmm0
+ vmovdqu .Lbswap_mask(%rip),%xmm1
+ vmovdqu 128-16(%rsi),%xmm2
+ vmovdqu .Lgfpoly(%rip),%xmm3
+ vpshufb %xmm1,%xmm0,%xmm0
+
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm4
+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm5
+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x01,%xmm4,%xmm3,%xmm6
+ vpshufd $0x4e,%xmm4,%xmm4
+ vpxor %xmm4,%xmm5,%xmm5
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x01,%xmm5,%xmm3,%xmm4
+ vpshufd $0x4e,%xmm5,%xmm5
+ vpxor %xmm5,%xmm0,%xmm0
+ vpxor %xmm4,%xmm0,%xmm0
+
+
+ vpshufb %xmm1,%xmm0,%xmm0
+ vmovdqu %xmm0,(%rdi)
+ ret
+
+.cfi_endproc
+.size gcm_gmult_vpclmulqdq_avx2, . - gcm_gmult_vpclmulqdq_avx2
+.globl gcm_ghash_vpclmulqdq_avx2
+.hidden gcm_ghash_vpclmulqdq_avx2
+.type gcm_ghash_vpclmulqdq_avx2,@function
+.align 32
+gcm_ghash_vpclmulqdq_avx2:
+.cfi_startproc
+
+_CET_ENDBR
+
+
+
+ vbroadcasti128 .Lbswap_mask(%rip),%ymm6
+ vmovdqu (%rdi),%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vbroadcasti128 .Lgfpoly(%rip),%ymm7
+
+
+ cmpq $32,%rcx
+ jb .Lghash_lastblock
+
+ cmpq $127,%rcx
+ jbe .Lghash_loop_1x
+
+
+ vmovdqu 128(%rsi),%ymm8
+ vmovdqu 128+32(%rsi),%ymm9
+.Lghash_loop_4x:
+
+ vmovdqu 0(%rdx),%ymm1
+ vpshufb %ymm6,%ymm1,%ymm1
+ vmovdqu 0(%rsi),%ymm2
+ vpxor %ymm5,%ymm1,%ymm1
+ vpclmulqdq $0x00,%ymm2,%ymm1,%ymm3
+ vpclmulqdq $0x11,%ymm2,%ymm1,%ymm5
+ vpunpckhqdq %ymm1,%ymm1,%ymm0
+ vpxor %ymm1,%ymm0,%ymm0
+ vpclmulqdq $0x00,%ymm8,%ymm0,%ymm4
+
+ vmovdqu 32(%rdx),%ymm1
+ vpshufb %ymm6,%ymm1,%ymm1
+ vmovdqu 32(%rsi),%ymm2
+ vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0
+ vpxor %ymm0,%ymm5,%ymm5
+ vpunpckhqdq %ymm1,%ymm1,%ymm0
+ vpxor %ymm1,%ymm0,%ymm0
+ vpclmulqdq $0x10,%ymm8,%ymm0,%ymm0
+ vpxor %ymm0,%ymm4,%ymm4
+
+ vmovdqu 64(%rdx),%ymm1
+ vpshufb %ymm6,%ymm1,%ymm1
+ vmovdqu 64(%rsi),%ymm2
+ vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0
+ vpxor %ymm0,%ymm5,%ymm5
+ vpunpckhqdq %ymm1,%ymm1,%ymm0
+ vpxor %ymm1,%ymm0,%ymm0
+ vpclmulqdq $0x00,%ymm9,%ymm0,%ymm0
+ vpxor %ymm0,%ymm4,%ymm4
+
+
+ vmovdqu 96(%rdx),%ymm1
+ vpshufb %ymm6,%ymm1,%ymm1
+ vmovdqu 96(%rsi),%ymm2
+ vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0
+ vpxor %ymm0,%ymm5,%ymm5
+ vpunpckhqdq %ymm1,%ymm1,%ymm0
+ vpxor %ymm1,%ymm0,%ymm0
+ vpclmulqdq $0x10,%ymm9,%ymm0,%ymm0
+ vpxor %ymm0,%ymm4,%ymm4
+
+ vpxor %ymm3,%ymm4,%ymm4
+ vpxor %ymm5,%ymm4,%ymm4
+
+
+ vbroadcasti128 .Lgfpoly(%rip),%ymm2
+ vpclmulqdq $0x01,%ymm3,%ymm2,%ymm0
+ vpshufd $0x4e,%ymm3,%ymm3
+ vpxor %ymm3,%ymm4,%ymm4
+ vpxor %ymm0,%ymm4,%ymm4
+
+ vpclmulqdq $0x01,%ymm4,%ymm2,%ymm0
+ vpshufd $0x4e,%ymm4,%ymm4
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm0,%ymm5,%ymm5
+ vextracti128 $1,%ymm5,%xmm0
+ vpxor %xmm0,%xmm5,%xmm5
+
+ subq $-128,%rdx
+ addq $-128,%rcx
+ cmpq $127,%rcx
+ ja .Lghash_loop_4x
+
+
+ cmpq $32,%rcx
+ jb .Lghash_loop_1x_done
+.Lghash_loop_1x:
+ vmovdqu (%rdx),%ymm0
+ vpshufb %ymm6,%ymm0,%ymm0
+ vpxor %ymm0,%ymm5,%ymm5
+ vmovdqu 128-32(%rsi),%ymm0
+ vpclmulqdq $0x00,%ymm0,%ymm5,%ymm1
+ vpclmulqdq $0x01,%ymm0,%ymm5,%ymm2
+ vpclmulqdq $0x10,%ymm0,%ymm5,%ymm3
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x01,%ymm1,%ymm7,%ymm3
+ vpshufd $0x4e,%ymm1,%ymm1
+ vpxor %ymm1,%ymm2,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x11,%ymm0,%ymm5,%ymm5
+ vpclmulqdq $0x01,%ymm2,%ymm7,%ymm1
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm1,%ymm5,%ymm5
+
+ vextracti128 $1,%ymm5,%xmm0
+ vpxor %xmm0,%xmm5,%xmm5
+ addq $32,%rdx
+ subq $32,%rcx
+ cmpq $32,%rcx
+ jae .Lghash_loop_1x
+.Lghash_loop_1x_done:
+
+
+ vzeroupper
+
+
+.Lghash_lastblock:
+ testq %rcx,%rcx
+ jz .Lghash_done
+ vmovdqu (%rdx),%xmm0
+ vpshufb %xmm6,%xmm0,%xmm0
+ vpxor %xmm0,%xmm5,%xmm5
+ vmovdqu 128-16(%rsi),%xmm0
+ vpclmulqdq $0x00,%xmm0,%xmm5,%xmm1
+ vpclmulqdq $0x01,%xmm0,%xmm5,%xmm2
+ vpclmulqdq $0x10,%xmm0,%xmm5,%xmm3
+ vpxor %xmm3,%xmm2,%xmm2
+ vpclmulqdq $0x01,%xmm1,%xmm7,%xmm3
+ vpshufd $0x4e,%xmm1,%xmm1
+ vpxor %xmm1,%xmm2,%xmm2
+ vpxor %xmm3,%xmm2,%xmm2
+ vpclmulqdq $0x11,%xmm0,%xmm5,%xmm5
+ vpclmulqdq $0x01,%xmm2,%xmm7,%xmm1
+ vpshufd $0x4e,%xmm2,%xmm2
+ vpxor %xmm2,%xmm5,%xmm5
+ vpxor %xmm1,%xmm5,%xmm5
+
+
+.Lghash_done:
+
+ vpshufb %xmm6,%xmm5,%xmm5
+ vmovdqu %xmm5,(%rdi)
+ ret
+
+.cfi_endproc
+.size gcm_ghash_vpclmulqdq_avx2, . - gcm_ghash_vpclmulqdq_avx2
+.globl aes_gcm_enc_update_vaes_avx2
+.hidden aes_gcm_enc_update_vaes_avx2
+.type aes_gcm_enc_update_vaes_avx2,@function
+.align 32
+aes_gcm_enc_update_vaes_avx2:
+.cfi_startproc
+
+_CET_ENDBR
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-16
+
+ movq 16(%rsp),%r12
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+ movb $1,BORINGSSL_function_hit+8(%rip)
+#endif
+ vbroadcasti128 .Lbswap_mask(%rip),%ymm0
+
+
+
+ vmovdqu (%r12),%xmm1
+ vpshufb %xmm0,%xmm1,%xmm1
+ vbroadcasti128 (%r8),%ymm11
+ vpshufb %ymm0,%ymm11,%ymm11
+
+
+
+ movl 240(%rcx),%r10d
+ leal -20(,%r10,4),%r10d
+
+
+
+
+ leaq 96(%rcx,%r10,4),%r11
+ vbroadcasti128 (%rcx),%ymm9
+ vbroadcasti128 (%r11),%ymm10
+
+
+ vpaddd .Lctr_pattern(%rip),%ymm11,%ymm11
+
+
+
+ cmpq $127,%rdx
+ jbe .Lcrypt_loop_4x_done__func1
+
+ vmovdqu 128(%r9),%ymm7
+ vmovdqu 128+32(%r9),%ymm8
+
+
+
+ vmovdqu .Linc_2blocks(%rip),%ymm2
+ vpshufb %ymm0,%ymm11,%ymm12
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm13
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm14
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm15
+ vpaddd %ymm2,%ymm11,%ymm11
+
+
+ vpxor %ymm9,%ymm12,%ymm12
+ vpxor %ymm9,%ymm13,%ymm13
+ vpxor %ymm9,%ymm14,%ymm14
+ vpxor %ymm9,%ymm15,%ymm15
+
+ leaq 16(%rcx),%rax
+.Lvaesenc_loop_first_4_vecs__func1:
+ vbroadcasti128 (%rax),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne .Lvaesenc_loop_first_4_vecs__func1
+ vpxor 0(%rdi),%ymm10,%ymm2
+ vpxor 32(%rdi),%ymm10,%ymm3
+ vpxor 64(%rdi),%ymm10,%ymm5
+ vpxor 96(%rdi),%ymm10,%ymm6
+ vaesenclast %ymm2,%ymm12,%ymm12
+ vaesenclast %ymm3,%ymm13,%ymm13
+ vaesenclast %ymm5,%ymm14,%ymm14
+ vaesenclast %ymm6,%ymm15,%ymm15
+ vmovdqu %ymm12,0(%rsi)
+ vmovdqu %ymm13,32(%rsi)
+ vmovdqu %ymm14,64(%rsi)
+ vmovdqu %ymm15,96(%rsi)
+
+ subq $-128,%rdi
+ addq $-128,%rdx
+ cmpq $127,%rdx
+ jbe .Lghash_last_ciphertext_4x__func1
+.align 16
+.Lcrypt_loop_4x__func1:
+
+
+
+
+ vmovdqu .Linc_2blocks(%rip),%ymm2
+ vpshufb %ymm0,%ymm11,%ymm12
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm13
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm14
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm15
+ vpaddd %ymm2,%ymm11,%ymm11
+
+
+ vpxor %ymm9,%ymm12,%ymm12
+ vpxor %ymm9,%ymm13,%ymm13
+ vpxor %ymm9,%ymm14,%ymm14
+ vpxor %ymm9,%ymm15,%ymm15
+
+ cmpl $24,%r10d
+ jl .Laes128__func1
+ je .Laes192__func1
+
+ vbroadcasti128 -208(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vbroadcasti128 -192(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+.Laes192__func1:
+ vbroadcasti128 -176(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vbroadcasti128 -160(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+.Laes128__func1:
+
+ vmovdqu 0(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 0(%r9),%ymm4
+ vpxor %ymm1,%ymm3,%ymm3
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6
+
+ vbroadcasti128 -144(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vbroadcasti128 -128(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vmovdqu 32(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 32(%r9),%ymm4
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vbroadcasti128 -112(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vmovdqu 64(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 64(%r9),%ymm4
+
+ vbroadcasti128 -96(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+
+ vbroadcasti128 -80(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+
+ vmovdqu 96(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+
+ vbroadcasti128 -64(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vmovdqu 96(%r9),%ymm4
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vbroadcasti128 -48(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm1,%ymm6,%ymm6
+
+
+ vbroadcasti128 .Lgfpoly(%rip),%ymm4
+ vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2
+ vpshufd $0x4e,%ymm5,%ymm5
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vbroadcasti128 -32(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ vpxor %ymm6,%ymm1,%ymm1
+ vpxor %ymm2,%ymm1,%ymm1
+
+ vbroadcasti128 -16(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vextracti128 $1,%ymm1,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+
+
+ subq $-128,%rsi
+ vpxor 0(%rdi),%ymm10,%ymm2
+ vpxor 32(%rdi),%ymm10,%ymm3
+ vpxor 64(%rdi),%ymm10,%ymm5
+ vpxor 96(%rdi),%ymm10,%ymm6
+ vaesenclast %ymm2,%ymm12,%ymm12
+ vaesenclast %ymm3,%ymm13,%ymm13
+ vaesenclast %ymm5,%ymm14,%ymm14
+ vaesenclast %ymm6,%ymm15,%ymm15
+ vmovdqu %ymm12,0(%rsi)
+ vmovdqu %ymm13,32(%rsi)
+ vmovdqu %ymm14,64(%rsi)
+ vmovdqu %ymm15,96(%rsi)
+
+ subq $-128,%rdi
+
+ addq $-128,%rdx
+ cmpq $127,%rdx
+ ja .Lcrypt_loop_4x__func1
+.Lghash_last_ciphertext_4x__func1:
+
+ vmovdqu 0(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 0(%r9),%ymm4
+ vpxor %ymm1,%ymm3,%ymm3
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6
+
+ vmovdqu 32(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 32(%r9),%ymm4
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vmovdqu 64(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 64(%r9),%ymm4
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+
+ vmovdqu 96(%rsi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 96(%r9),%ymm4
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm1,%ymm6,%ymm6
+
+
+ vbroadcasti128 .Lgfpoly(%rip),%ymm4
+ vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2
+ vpshufd $0x4e,%ymm5,%ymm5
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ vpxor %ymm6,%ymm1,%ymm1
+ vpxor %ymm2,%ymm1,%ymm1
+ vextracti128 $1,%ymm1,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+
+ subq $-128,%rsi
+.Lcrypt_loop_4x_done__func1:
+
+ testq %rdx,%rdx
+ jz .Ldone__func1
+
+
+
+
+
+ leaq 128(%r9),%r8
+ subq %rdx,%r8
+
+
+ vpxor %xmm5,%xmm5,%xmm5
+ vpxor %xmm6,%xmm6,%xmm6
+ vpxor %xmm7,%xmm7,%xmm7
+
+ cmpq $64,%rdx
+ jb .Llessthan64bytes__func1
+
+
+ vpshufb %ymm0,%ymm11,%ymm12
+ vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm13
+ vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11
+ vpxor %ymm9,%ymm12,%ymm12
+ vpxor %ymm9,%ymm13,%ymm13
+ leaq 16(%rcx),%rax
+.Lvaesenc_loop_tail_1__func1:
+ vbroadcasti128 (%rax),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne .Lvaesenc_loop_tail_1__func1
+ vaesenclast %ymm10,%ymm12,%ymm12
+ vaesenclast %ymm10,%ymm13,%ymm13
+
+
+ vmovdqu 0(%rdi),%ymm2
+ vmovdqu 32(%rdi),%ymm3
+ vpxor %ymm2,%ymm12,%ymm12
+ vpxor %ymm3,%ymm13,%ymm13
+ vmovdqu %ymm12,0(%rsi)
+ vmovdqu %ymm13,32(%rsi)
+
+
+ vpshufb %ymm0,%ymm12,%ymm12
+ vpshufb %ymm0,%ymm13,%ymm13
+ vpxor %ymm1,%ymm12,%ymm12
+ vmovdqu (%r8),%ymm2
+ vmovdqu 32(%r8),%ymm3
+ vpclmulqdq $0x00,%ymm2,%ymm12,%ymm5
+ vpclmulqdq $0x01,%ymm2,%ymm12,%ymm6
+ vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%ymm2,%ymm12,%ymm7
+ vpclmulqdq $0x00,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm5,%ymm5
+ vpclmulqdq $0x01,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x10,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+
+ addq $64,%r8
+ addq $64,%rdi
+ addq $64,%rsi
+ subq $64,%rdx
+ jz .Lreduce__func1
+
+ vpxor %xmm1,%xmm1,%xmm1
+
+
+.Llessthan64bytes__func1:
+ vpshufb %ymm0,%ymm11,%ymm12
+ vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm13
+ vpxor %ymm9,%ymm12,%ymm12
+ vpxor %ymm9,%ymm13,%ymm13
+ leaq 16(%rcx),%rax
+.Lvaesenc_loop_tail_2__func1:
+ vbroadcasti128 (%rax),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne .Lvaesenc_loop_tail_2__func1
+ vaesenclast %ymm10,%ymm12,%ymm12
+ vaesenclast %ymm10,%ymm13,%ymm13
+
+
+
+
+ cmpq $32,%rdx
+ jb .Lxor_one_block__func1
+ je .Lxor_two_blocks__func1
+
+.Lxor_three_blocks__func1:
+ vmovdqu 0(%rdi),%ymm2
+ vmovdqu 32(%rdi),%xmm3
+ vpxor %ymm2,%ymm12,%ymm12
+ vpxor %xmm3,%xmm13,%xmm13
+ vmovdqu %ymm12,0(%rsi)
+ vmovdqu %xmm13,32(%rsi)
+
+ vpshufb %ymm0,%ymm12,%ymm12
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpxor %ymm1,%ymm12,%ymm12
+ vmovdqu (%r8),%ymm2
+ vmovdqu 32(%r8),%xmm3
+ vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm5,%ymm5
+ vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm7,%ymm7
+ jmp .Lghash_mul_one_vec_unreduced__func1
+
+.Lxor_two_blocks__func1:
+ vmovdqu (%rdi),%ymm2
+ vpxor %ymm2,%ymm12,%ymm12
+ vmovdqu %ymm12,(%rsi)
+ vpshufb %ymm0,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm12
+ vmovdqu (%r8),%ymm2
+ jmp .Lghash_mul_one_vec_unreduced__func1
+
+.Lxor_one_block__func1:
+ vmovdqu (%rdi),%xmm2
+ vpxor %xmm2,%xmm12,%xmm12
+ vmovdqu %xmm12,(%rsi)
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm1,%xmm12,%xmm12
+ vmovdqu (%r8),%xmm2
+
+.Lghash_mul_one_vec_unreduced__func1:
+ vpclmulqdq $0x00,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm5,%ymm5
+ vpclmulqdq $0x01,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+
+.Lreduce__func1:
+
+ vbroadcasti128 .Lgfpoly(%rip),%ymm2
+ vpclmulqdq $0x01,%ymm5,%ymm2,%ymm3
+ vpshufd $0x4e,%ymm5,%ymm5
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpclmulqdq $0x01,%ymm6,%ymm2,%ymm3
+ vpshufd $0x4e,%ymm6,%ymm6
+ vpxor %ymm6,%ymm7,%ymm7
+ vpxor %ymm3,%ymm7,%ymm7
+ vextracti128 $1,%ymm7,%xmm1
+ vpxor %xmm7,%xmm1,%xmm1
+
+.Ldone__func1:
+
+ vpshufb %xmm0,%xmm1,%xmm1
+ vmovdqu %xmm1,(%r12)
+
+ vzeroupper
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ ret
+
+.cfi_endproc
+.size aes_gcm_enc_update_vaes_avx2, . - aes_gcm_enc_update_vaes_avx2
+.globl aes_gcm_dec_update_vaes_avx2
+.hidden aes_gcm_dec_update_vaes_avx2
+.type aes_gcm_dec_update_vaes_avx2,@function
+.align 32
+aes_gcm_dec_update_vaes_avx2:
+.cfi_startproc
+
+_CET_ENDBR
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-16
+
+ movq 16(%rsp),%r12
+ vbroadcasti128 .Lbswap_mask(%rip),%ymm0
+
+
+
+ vmovdqu (%r12),%xmm1
+ vpshufb %xmm0,%xmm1,%xmm1
+ vbroadcasti128 (%r8),%ymm11
+ vpshufb %ymm0,%ymm11,%ymm11
+
+
+
+ movl 240(%rcx),%r10d
+ leal -20(,%r10,4),%r10d
+
+
+
+
+ leaq 96(%rcx,%r10,4),%r11
+ vbroadcasti128 (%rcx),%ymm9
+ vbroadcasti128 (%r11),%ymm10
+
+
+ vpaddd .Lctr_pattern(%rip),%ymm11,%ymm11
+
+
+
+ cmpq $127,%rdx
+ jbe .Lcrypt_loop_4x_done__func2
+
+ vmovdqu 128(%r9),%ymm7
+ vmovdqu 128+32(%r9),%ymm8
+.align 16
+.Lcrypt_loop_4x__func2:
+
+
+
+
+ vmovdqu .Linc_2blocks(%rip),%ymm2
+ vpshufb %ymm0,%ymm11,%ymm12
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm13
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm14
+ vpaddd %ymm2,%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm15
+ vpaddd %ymm2,%ymm11,%ymm11
+
+
+ vpxor %ymm9,%ymm12,%ymm12
+ vpxor %ymm9,%ymm13,%ymm13
+ vpxor %ymm9,%ymm14,%ymm14
+ vpxor %ymm9,%ymm15,%ymm15
+
+ cmpl $24,%r10d
+ jl .Laes128__func2
+ je .Laes192__func2
+
+ vbroadcasti128 -208(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vbroadcasti128 -192(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+.Laes192__func2:
+ vbroadcasti128 -176(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vbroadcasti128 -160(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+.Laes128__func2:
+
+ vmovdqu 0(%rdi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 0(%r9),%ymm4
+ vpxor %ymm1,%ymm3,%ymm3
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6
+
+ vbroadcasti128 -144(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vbroadcasti128 -128(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vmovdqu 32(%rdi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 32(%r9),%ymm4
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vbroadcasti128 -112(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vmovdqu 64(%rdi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+ vmovdqu 64(%r9),%ymm4
+
+ vbroadcasti128 -96(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+
+ vbroadcasti128 -80(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+
+ vmovdqu 96(%rdi),%ymm3
+ vpshufb %ymm0,%ymm3,%ymm3
+
+ vbroadcasti128 -64(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vmovdqu 96(%r9),%ymm4
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm5,%ymm5
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vpunpckhqdq %ymm3,%ymm3,%ymm2
+ vpxor %ymm3,%ymm2,%ymm2
+ vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vbroadcasti128 -48(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm1,%ymm6,%ymm6
+
+
+ vbroadcasti128 .Lgfpoly(%rip),%ymm4
+ vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2
+ vpshufd $0x4e,%ymm5,%ymm5
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm2,%ymm6,%ymm6
+
+ vbroadcasti128 -32(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+
+ vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ vpxor %ymm6,%ymm1,%ymm1
+ vpxor %ymm2,%ymm1,%ymm1
+
+ vbroadcasti128 -16(%r11),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ vaesenc %ymm2,%ymm14,%ymm14
+ vaesenc %ymm2,%ymm15,%ymm15
+
+ vextracti128 $1,%ymm1,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+
+
+
+ vpxor 0(%rdi),%ymm10,%ymm2
+ vpxor 32(%rdi),%ymm10,%ymm3
+ vpxor 64(%rdi),%ymm10,%ymm5
+ vpxor 96(%rdi),%ymm10,%ymm6
+ vaesenclast %ymm2,%ymm12,%ymm12
+ vaesenclast %ymm3,%ymm13,%ymm13
+ vaesenclast %ymm5,%ymm14,%ymm14
+ vaesenclast %ymm6,%ymm15,%ymm15
+ vmovdqu %ymm12,0(%rsi)
+ vmovdqu %ymm13,32(%rsi)
+ vmovdqu %ymm14,64(%rsi)
+ vmovdqu %ymm15,96(%rsi)
+
+ subq $-128,%rdi
+ subq $-128,%rsi
+ addq $-128,%rdx
+ cmpq $127,%rdx
+ ja .Lcrypt_loop_4x__func2
+.Lcrypt_loop_4x_done__func2:
+
+ testq %rdx,%rdx
+ jz .Ldone__func2
+
+
+
+
+
+ leaq 128(%r9),%r8
+ subq %rdx,%r8
+
+
+ vpxor %xmm5,%xmm5,%xmm5
+ vpxor %xmm6,%xmm6,%xmm6
+ vpxor %xmm7,%xmm7,%xmm7
+
+ cmpq $64,%rdx
+ jb .Llessthan64bytes__func2
+
+
+ vpshufb %ymm0,%ymm11,%ymm12
+ vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm13
+ vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11
+ vpxor %ymm9,%ymm12,%ymm12
+ vpxor %ymm9,%ymm13,%ymm13
+ leaq 16(%rcx),%rax
+.Lvaesenc_loop_tail_1__func2:
+ vbroadcasti128 (%rax),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne .Lvaesenc_loop_tail_1__func2
+ vaesenclast %ymm10,%ymm12,%ymm12
+ vaesenclast %ymm10,%ymm13,%ymm13
+
+
+ vmovdqu 0(%rdi),%ymm2
+ vmovdqu 32(%rdi),%ymm3
+ vpxor %ymm2,%ymm12,%ymm12
+ vpxor %ymm3,%ymm13,%ymm13
+ vmovdqu %ymm12,0(%rsi)
+ vmovdqu %ymm13,32(%rsi)
+
+
+ vpshufb %ymm0,%ymm2,%ymm12
+ vpshufb %ymm0,%ymm3,%ymm13
+ vpxor %ymm1,%ymm12,%ymm12
+ vmovdqu (%r8),%ymm2
+ vmovdqu 32(%r8),%ymm3
+ vpclmulqdq $0x00,%ymm2,%ymm12,%ymm5
+ vpclmulqdq $0x01,%ymm2,%ymm12,%ymm6
+ vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%ymm2,%ymm12,%ymm7
+ vpclmulqdq $0x00,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm5,%ymm5
+ vpclmulqdq $0x01,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x10,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%ymm3,%ymm13,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+
+ addq $64,%r8
+ addq $64,%rdi
+ addq $64,%rsi
+ subq $64,%rdx
+ jz .Lreduce__func2
+
+ vpxor %xmm1,%xmm1,%xmm1
+
+
+.Llessthan64bytes__func2:
+ vpshufb %ymm0,%ymm11,%ymm12
+ vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11
+ vpshufb %ymm0,%ymm11,%ymm13
+ vpxor %ymm9,%ymm12,%ymm12
+ vpxor %ymm9,%ymm13,%ymm13
+ leaq 16(%rcx),%rax
+.Lvaesenc_loop_tail_2__func2:
+ vbroadcasti128 (%rax),%ymm2
+ vaesenc %ymm2,%ymm12,%ymm12
+ vaesenc %ymm2,%ymm13,%ymm13
+ addq $16,%rax
+ cmpq %rax,%r11
+ jne .Lvaesenc_loop_tail_2__func2
+ vaesenclast %ymm10,%ymm12,%ymm12
+ vaesenclast %ymm10,%ymm13,%ymm13
+
+
+
+
+ cmpq $32,%rdx
+ jb .Lxor_one_block__func2
+ je .Lxor_two_blocks__func2
+
+.Lxor_three_blocks__func2:
+ vmovdqu 0(%rdi),%ymm2
+ vmovdqu 32(%rdi),%xmm3
+ vpxor %ymm2,%ymm12,%ymm12
+ vpxor %xmm3,%xmm13,%xmm13
+ vmovdqu %ymm12,0(%rsi)
+ vmovdqu %xmm13,32(%rsi)
+
+ vpshufb %ymm0,%ymm2,%ymm12
+ vpshufb %xmm0,%xmm3,%xmm13
+ vpxor %ymm1,%ymm12,%ymm12
+ vmovdqu (%r8),%ymm2
+ vmovdqu 32(%r8),%xmm3
+ vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm5,%ymm5
+ vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4
+ vpxor %ymm4,%ymm7,%ymm7
+ jmp .Lghash_mul_one_vec_unreduced__func2
+
+.Lxor_two_blocks__func2:
+ vmovdqu (%rdi),%ymm2
+ vpxor %ymm2,%ymm12,%ymm12
+ vmovdqu %ymm12,(%rsi)
+ vpshufb %ymm0,%ymm2,%ymm12
+ vpxor %ymm1,%ymm12,%ymm12
+ vmovdqu (%r8),%ymm2
+ jmp .Lghash_mul_one_vec_unreduced__func2
+
+.Lxor_one_block__func2:
+ vmovdqu (%rdi),%xmm2
+ vpxor %xmm2,%xmm12,%xmm12
+ vmovdqu %xmm12,(%rsi)
+ vpshufb %xmm0,%xmm2,%xmm12
+ vpxor %xmm1,%xmm12,%xmm12
+ vmovdqu (%r8),%xmm2
+
+.Lghash_mul_one_vec_unreduced__func2:
+ vpclmulqdq $0x00,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm5,%ymm5
+ vpclmulqdq $0x01,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm6,%ymm6
+ vpclmulqdq $0x11,%ymm2,%ymm12,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+
+.Lreduce__func2:
+
+ vbroadcasti128 .Lgfpoly(%rip),%ymm2
+ vpclmulqdq $0x01,%ymm5,%ymm2,%ymm3
+ vpshufd $0x4e,%ymm5,%ymm5
+ vpxor %ymm5,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpclmulqdq $0x01,%ymm6,%ymm2,%ymm3
+ vpshufd $0x4e,%ymm6,%ymm6
+ vpxor %ymm6,%ymm7,%ymm7
+ vpxor %ymm3,%ymm7,%ymm7
+ vextracti128 $1,%ymm7,%xmm1
+ vpxor %xmm7,%xmm1,%xmm1
+
+.Ldone__func2:
+
+ vpshufb %xmm0,%xmm1,%xmm1
+ vmovdqu %xmm1,(%r12)
+
+ vzeroupper
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ ret
+
+.cfi_endproc
+.size aes_gcm_dec_update_vaes_avx2, . - aes_gcm_dec_update_vaes_avx2
+#endif
diff --git a/gen/bcm/aes-gcm-avx2-x86_64-win.asm b/gen/bcm/aes-gcm-avx2-x86_64-win.asm
new file mode 100644
index 0000000..9201553
--- /dev/null
+++ b/gen/bcm/aes-gcm-avx2-x86_64-win.asm
@@ -0,0 +1,1588 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section .rdata rdata align=8
+ALIGN 16
+
+
+$L$bswap_mask:
+ DQ 0x08090a0b0c0d0e0f,0x0001020304050607
+
+
+
+
+
+
+
+
+$L$gfpoly:
+ DQ 1,0xc200000000000000
+
+
+$L$gfpoly_and_internal_carrybit:
+ DQ 1,0xc200000000000001
+
+ALIGN 32
+
+$L$ctr_pattern:
+ DQ 0,0
+ DQ 1,0
+$L$inc_2blocks:
+ DQ 2,0
+ DQ 2,0
+
+section .text code align=64
+
+global gcm_init_vpclmulqdq_avx2
+
+ALIGN 32
+gcm_init_vpclmulqdq_avx2:
+
+$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1:
+_CET_ENDBR
+ sub rsp,24
+$L$SEH_prologue_gcm_init_vpclmulqdq_avx2_2:
+ movdqa XMMWORD[rsp],xmm6
+$L$SEH_prologue_gcm_init_vpclmulqdq_avx2_3:
+
+$L$SEH_endprologue_gcm_init_vpclmulqdq_avx2_4:
+
+
+
+ vpshufd xmm3,XMMWORD[rdx],0x4e
+
+
+
+
+
+ vpshufd xmm0,xmm3,0xd3
+ vpsrad xmm0,xmm0,31
+ vpaddq xmm3,xmm3,xmm3
+ vpand xmm0,xmm0,XMMWORD[$L$gfpoly_and_internal_carrybit]
+ vpxor xmm3,xmm3,xmm0
+
+ vbroadcasti128 ymm6,XMMWORD[$L$gfpoly]
+
+
+ vpclmulqdq xmm0,xmm3,xmm3,0x00
+ vpclmulqdq xmm1,xmm3,xmm3,0x01
+ vpclmulqdq xmm2,xmm3,xmm3,0x10
+ vpxor xmm1,xmm1,xmm2
+ vpclmulqdq xmm2,xmm6,xmm0,0x01
+ vpshufd xmm0,xmm0,0x4e
+ vpxor xmm1,xmm1,xmm0
+ vpxor xmm1,xmm1,xmm2
+ vpclmulqdq xmm5,xmm3,xmm3,0x11
+ vpclmulqdq xmm0,xmm6,xmm1,0x01
+ vpshufd xmm1,xmm1,0x4e
+ vpxor xmm5,xmm5,xmm1
+ vpxor xmm5,xmm5,xmm0
+
+
+
+ vinserti128 ymm3,ymm5,xmm3,1
+ vinserti128 ymm5,ymm5,xmm5,1
+
+
+ vpclmulqdq ymm0,ymm3,ymm5,0x00
+ vpclmulqdq ymm1,ymm3,ymm5,0x01
+ vpclmulqdq ymm2,ymm3,ymm5,0x10
+ vpxor ymm1,ymm1,ymm2
+ vpclmulqdq ymm2,ymm6,ymm0,0x01
+ vpshufd ymm0,ymm0,0x4e
+ vpxor ymm1,ymm1,ymm0
+ vpxor ymm1,ymm1,ymm2
+ vpclmulqdq ymm4,ymm3,ymm5,0x11
+ vpclmulqdq ymm0,ymm6,ymm1,0x01
+ vpshufd ymm1,ymm1,0x4e
+ vpxor ymm4,ymm4,ymm1
+ vpxor ymm4,ymm4,ymm0
+
+
+
+ vmovdqu YMMWORD[96+rcx],ymm3
+ vmovdqu YMMWORD[64+rcx],ymm4
+
+
+
+ vpunpcklqdq ymm0,ymm4,ymm3
+ vpunpckhqdq ymm1,ymm4,ymm3
+ vpxor ymm0,ymm0,ymm1
+ vmovdqu YMMWORD[(128+32)+rcx],ymm0
+
+
+ vpclmulqdq ymm0,ymm4,ymm5,0x00
+ vpclmulqdq ymm1,ymm4,ymm5,0x01
+ vpclmulqdq ymm2,ymm4,ymm5,0x10
+ vpxor ymm1,ymm1,ymm2
+ vpclmulqdq ymm2,ymm6,ymm0,0x01
+ vpshufd ymm0,ymm0,0x4e
+ vpxor ymm1,ymm1,ymm0
+ vpxor ymm1,ymm1,ymm2
+ vpclmulqdq ymm3,ymm4,ymm5,0x11
+ vpclmulqdq ymm0,ymm6,ymm1,0x01
+ vpshufd ymm1,ymm1,0x4e
+ vpxor ymm3,ymm3,ymm1
+ vpxor ymm3,ymm3,ymm0
+
+ vpclmulqdq ymm0,ymm3,ymm5,0x00
+ vpclmulqdq ymm1,ymm3,ymm5,0x01
+ vpclmulqdq ymm2,ymm3,ymm5,0x10
+ vpxor ymm1,ymm1,ymm2
+ vpclmulqdq ymm2,ymm6,ymm0,0x01
+ vpshufd ymm0,ymm0,0x4e
+ vpxor ymm1,ymm1,ymm0
+ vpxor ymm1,ymm1,ymm2
+ vpclmulqdq ymm4,ymm3,ymm5,0x11
+ vpclmulqdq ymm0,ymm6,ymm1,0x01
+ vpshufd ymm1,ymm1,0x4e
+ vpxor ymm4,ymm4,ymm1
+ vpxor ymm4,ymm4,ymm0
+
+ vmovdqu YMMWORD[32+rcx],ymm3
+ vmovdqu YMMWORD[rcx],ymm4
+
+
+
+ vpunpcklqdq ymm0,ymm4,ymm3
+ vpunpckhqdq ymm1,ymm4,ymm3
+ vpxor ymm0,ymm0,ymm1
+ vmovdqu YMMWORD[128+rcx],ymm0
+
+ vzeroupper
+ movdqa xmm6,XMMWORD[rsp]
+ add rsp,24
+ ret
+$L$SEH_end_gcm_init_vpclmulqdq_avx2_5:
+
+
+global gcm_gmult_vpclmulqdq_avx2
+
+ALIGN 32
+gcm_gmult_vpclmulqdq_avx2:
+
+$L$SEH_begin_gcm_gmult_vpclmulqdq_avx2_1:
+_CET_ENDBR
+ sub rsp,24
+$L$SEH_prologue_gcm_gmult_vpclmulqdq_avx2_2:
+ movdqa XMMWORD[rsp],xmm6
+$L$SEH_prologue_gcm_gmult_vpclmulqdq_avx2_3:
+
+$L$SEH_endprologue_gcm_gmult_vpclmulqdq_avx2_4:
+
+ vmovdqu xmm0,XMMWORD[rcx]
+ vmovdqu xmm1,XMMWORD[$L$bswap_mask]
+ vmovdqu xmm2,XMMWORD[((128-16))+rdx]
+ vmovdqu xmm3,XMMWORD[$L$gfpoly]
+ vpshufb xmm0,xmm0,xmm1
+
+ vpclmulqdq xmm4,xmm0,xmm2,0x00
+ vpclmulqdq xmm5,xmm0,xmm2,0x01
+ vpclmulqdq xmm6,xmm0,xmm2,0x10
+ vpxor xmm5,xmm5,xmm6
+ vpclmulqdq xmm6,xmm3,xmm4,0x01
+ vpshufd xmm4,xmm4,0x4e
+ vpxor xmm5,xmm5,xmm4
+ vpxor xmm5,xmm5,xmm6
+ vpclmulqdq xmm0,xmm0,xmm2,0x11
+ vpclmulqdq xmm4,xmm3,xmm5,0x01
+ vpshufd xmm5,xmm5,0x4e
+ vpxor xmm0,xmm0,xmm5
+ vpxor xmm0,xmm0,xmm4
+
+
+ vpshufb xmm0,xmm0,xmm1
+ vmovdqu XMMWORD[rcx],xmm0
+ movdqa xmm6,XMMWORD[rsp]
+ add rsp,24
+ ret
+$L$SEH_end_gcm_gmult_vpclmulqdq_avx2_5:
+
+
+global gcm_ghash_vpclmulqdq_avx2
+
+ALIGN 32
+gcm_ghash_vpclmulqdq_avx2:
+
+$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1:
+_CET_ENDBR
+ sub rsp,72
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_2:
+ movdqa XMMWORD[rsp],xmm6
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_3:
+ movdqa XMMWORD[16+rsp],xmm7
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_4:
+ movdqa XMMWORD[32+rsp],xmm8
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_5:
+ movdqa XMMWORD[48+rsp],xmm9
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_6:
+
+$L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx2_7:
+
+ vbroadcasti128 ymm6,XMMWORD[$L$bswap_mask]
+ vmovdqu xmm5,XMMWORD[rcx]
+ vpshufb xmm5,xmm5,xmm6
+ vbroadcasti128 ymm7,XMMWORD[$L$gfpoly]
+
+
+ cmp r9,32
+ jb NEAR $L$ghash_lastblock
+
+ cmp r9,127
+ jbe NEAR $L$ghash_loop_1x
+
+
+ vmovdqu ymm8,YMMWORD[128+rdx]
+ vmovdqu ymm9,YMMWORD[((128+32))+rdx]
+$L$ghash_loop_4x:
+
+ vmovdqu ymm1,YMMWORD[r8]
+ vpshufb ymm1,ymm1,ymm6
+ vmovdqu ymm2,YMMWORD[rdx]
+ vpxor ymm1,ymm1,ymm5
+ vpclmulqdq ymm3,ymm1,ymm2,0x00
+ vpclmulqdq ymm5,ymm1,ymm2,0x11
+ vpunpckhqdq ymm0,ymm1,ymm1
+ vpxor ymm0,ymm0,ymm1
+ vpclmulqdq ymm4,ymm0,ymm8,0x00
+
+ vmovdqu ymm1,YMMWORD[32+r8]
+ vpshufb ymm1,ymm1,ymm6
+ vmovdqu ymm2,YMMWORD[32+rdx]
+ vpclmulqdq ymm0,ymm1,ymm2,0x00
+ vpxor ymm3,ymm3,ymm0
+ vpclmulqdq ymm0,ymm1,ymm2,0x11
+ vpxor ymm5,ymm5,ymm0
+ vpunpckhqdq ymm0,ymm1,ymm1
+ vpxor ymm0,ymm0,ymm1
+ vpclmulqdq ymm0,ymm0,ymm8,0x10
+ vpxor ymm4,ymm4,ymm0
+
+ vmovdqu ymm1,YMMWORD[64+r8]
+ vpshufb ymm1,ymm1,ymm6
+ vmovdqu ymm2,YMMWORD[64+rdx]
+ vpclmulqdq ymm0,ymm1,ymm2,0x00
+ vpxor ymm3,ymm3,ymm0
+ vpclmulqdq ymm0,ymm1,ymm2,0x11
+ vpxor ymm5,ymm5,ymm0
+ vpunpckhqdq ymm0,ymm1,ymm1
+ vpxor ymm0,ymm0,ymm1
+ vpclmulqdq ymm0,ymm0,ymm9,0x00
+ vpxor ymm4,ymm4,ymm0
+
+
+ vmovdqu ymm1,YMMWORD[96+r8]
+ vpshufb ymm1,ymm1,ymm6
+ vmovdqu ymm2,YMMWORD[96+rdx]
+ vpclmulqdq ymm0,ymm1,ymm2,0x00
+ vpxor ymm3,ymm3,ymm0
+ vpclmulqdq ymm0,ymm1,ymm2,0x11
+ vpxor ymm5,ymm5,ymm0
+ vpunpckhqdq ymm0,ymm1,ymm1
+ vpxor ymm0,ymm0,ymm1
+ vpclmulqdq ymm0,ymm0,ymm9,0x10
+ vpxor ymm4,ymm4,ymm0
+
+ vpxor ymm4,ymm4,ymm3
+ vpxor ymm4,ymm4,ymm5
+
+
+ vbroadcasti128 ymm2,XMMWORD[$L$gfpoly]
+ vpclmulqdq ymm0,ymm2,ymm3,0x01
+ vpshufd ymm3,ymm3,0x4e
+ vpxor ymm4,ymm4,ymm3
+ vpxor ymm4,ymm4,ymm0
+
+ vpclmulqdq ymm0,ymm2,ymm4,0x01
+ vpshufd ymm4,ymm4,0x4e
+ vpxor ymm5,ymm5,ymm4
+ vpxor ymm5,ymm5,ymm0
+ vextracti128 xmm0,ymm5,1
+ vpxor xmm5,xmm5,xmm0
+
+ sub r8,-128
+ add r9,-128
+ cmp r9,127
+ ja NEAR $L$ghash_loop_4x
+
+
+ cmp r9,32
+ jb NEAR $L$ghash_loop_1x_done
+$L$ghash_loop_1x:
+ vmovdqu ymm0,YMMWORD[r8]
+ vpshufb ymm0,ymm0,ymm6
+ vpxor ymm5,ymm5,ymm0
+ vmovdqu ymm0,YMMWORD[((128-32))+rdx]
+ vpclmulqdq ymm1,ymm5,ymm0,0x00
+ vpclmulqdq ymm2,ymm5,ymm0,0x01
+ vpclmulqdq ymm3,ymm5,ymm0,0x10
+ vpxor ymm2,ymm2,ymm3
+ vpclmulqdq ymm3,ymm7,ymm1,0x01
+ vpshufd ymm1,ymm1,0x4e
+ vpxor ymm2,ymm2,ymm1
+ vpxor ymm2,ymm2,ymm3
+ vpclmulqdq ymm5,ymm5,ymm0,0x11
+ vpclmulqdq ymm1,ymm7,ymm2,0x01
+ vpshufd ymm2,ymm2,0x4e
+ vpxor ymm5,ymm5,ymm2
+ vpxor ymm5,ymm5,ymm1
+
+ vextracti128 xmm0,ymm5,1
+ vpxor xmm5,xmm5,xmm0
+ add r8,32
+ sub r9,32
+ cmp r9,32
+ jae NEAR $L$ghash_loop_1x
+$L$ghash_loop_1x_done:
+
+
+ vzeroupper
+
+
+$L$ghash_lastblock:
+ test r9,r9
+ jz NEAR $L$ghash_done
+ vmovdqu xmm0,XMMWORD[r8]
+ vpshufb xmm0,xmm0,xmm6
+ vpxor xmm5,xmm5,xmm0
+ vmovdqu xmm0,XMMWORD[((128-16))+rdx]
+ vpclmulqdq xmm1,xmm5,xmm0,0x00
+ vpclmulqdq xmm2,xmm5,xmm0,0x01
+ vpclmulqdq xmm3,xmm5,xmm0,0x10
+ vpxor xmm2,xmm2,xmm3
+ vpclmulqdq xmm3,xmm7,xmm1,0x01
+ vpshufd xmm1,xmm1,0x4e
+ vpxor xmm2,xmm2,xmm1
+ vpxor xmm2,xmm2,xmm3
+ vpclmulqdq xmm5,xmm5,xmm0,0x11
+ vpclmulqdq xmm1,xmm7,xmm2,0x01
+ vpshufd xmm2,xmm2,0x4e
+ vpxor xmm5,xmm5,xmm2
+ vpxor xmm5,xmm5,xmm1
+
+
+$L$ghash_done:
+
+ vpshufb xmm5,xmm5,xmm6
+ vmovdqu XMMWORD[rcx],xmm5
+ movdqa xmm6,XMMWORD[rsp]
+ movdqa xmm7,XMMWORD[16+rsp]
+ movdqa xmm8,XMMWORD[32+rsp]
+ movdqa xmm9,XMMWORD[48+rsp]
+ add rsp,72
+ ret
+$L$SEH_end_gcm_ghash_vpclmulqdq_avx2_8:
+
+
+global aes_gcm_enc_update_vaes_avx2
+
+ALIGN 32
+aes_gcm_enc_update_vaes_avx2:
+
+$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1:
+_CET_ENDBR
+ push rsi
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_2:
+ push rdi
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_3:
+ push r12
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_4:
+
+ mov rsi,QWORD[64+rsp]
+ mov rdi,QWORD[72+rsp]
+ mov r12,QWORD[80+rsp]
+ sub rsp,160
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_5:
+ movdqa XMMWORD[rsp],xmm6
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_6:
+ movdqa XMMWORD[16+rsp],xmm7
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_7:
+ movdqa XMMWORD[32+rsp],xmm8
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_8:
+ movdqa XMMWORD[48+rsp],xmm9
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_9:
+ movdqa XMMWORD[64+rsp],xmm10
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_10:
+ movdqa XMMWORD[80+rsp],xmm11
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_11:
+ movdqa XMMWORD[96+rsp],xmm12
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_12:
+ movdqa XMMWORD[112+rsp],xmm13
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_13:
+ movdqa XMMWORD[128+rsp],xmm14
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_14:
+ movdqa XMMWORD[144+rsp],xmm15
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_15:
+
+$L$SEH_endprologue_aes_gcm_enc_update_vaes_avx2_16:
+%ifdef BORINGSSL_DISPATCH_TEST
+EXTERN BORINGSSL_function_hit
+ mov BYTE[((BORINGSSL_function_hit+8))],1
+%endif
+ vbroadcasti128 ymm0,XMMWORD[$L$bswap_mask]
+
+
+
+ vmovdqu xmm1,XMMWORD[r12]
+ vpshufb xmm1,xmm1,xmm0
+ vbroadcasti128 ymm11,XMMWORD[rsi]
+ vpshufb ymm11,ymm11,ymm0
+
+
+
+ mov r10d,DWORD[240+r9]
+ lea r10d,[((-20))+r10*4]
+
+
+
+
+ lea r11,[96+r10*4+r9]
+ vbroadcasti128 ymm9,XMMWORD[r9]
+ vbroadcasti128 ymm10,XMMWORD[r11]
+
+
+ vpaddd ymm11,ymm11,YMMWORD[$L$ctr_pattern]
+
+
+
+ cmp r8,127
+ jbe NEAR $L$crypt_loop_4x_done__func1
+
+ vmovdqu ymm7,YMMWORD[128+rdi]
+ vmovdqu ymm8,YMMWORD[((128+32))+rdi]
+
+
+
+ vmovdqu ymm2,YMMWORD[$L$inc_2blocks]
+ vpshufb ymm12,ymm11,ymm0
+ vpaddd ymm11,ymm11,ymm2
+ vpshufb ymm13,ymm11,ymm0
+ vpaddd ymm11,ymm11,ymm2
+ vpshufb ymm14,ymm11,ymm0
+ vpaddd ymm11,ymm11,ymm2
+ vpshufb ymm15,ymm11,ymm0
+ vpaddd ymm11,ymm11,ymm2
+
+
+ vpxor ymm12,ymm12,ymm9
+ vpxor ymm13,ymm13,ymm9
+ vpxor ymm14,ymm14,ymm9
+ vpxor ymm15,ymm15,ymm9
+
+ lea rax,[16+r9]
+$L$vaesenc_loop_first_4_vecs__func1:
+ vbroadcasti128 ymm2,XMMWORD[rax]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+ add rax,16
+ cmp r11,rax
+ jne NEAR $L$vaesenc_loop_first_4_vecs__func1
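+; XOR the last round key into the plaintext first, so that vaesenclast both
+; finishes the AES rounds and produces the ciphertext in one step.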
+ vpxor ymm2,ymm10,YMMWORD[rcx]
+ vpxor ymm3,ymm10,YMMWORD[32+rcx]
+ vpxor ymm5,ymm10,YMMWORD[64+rcx]
+ vpxor ymm6,ymm10,YMMWORD[96+rcx]
+ vaesenclast ymm12,ymm12,ymm2
+ vaesenclast ymm13,ymm13,ymm3
+ vaesenclast ymm14,ymm14,ymm5
+ vaesenclast ymm15,ymm15,ymm6
+ vmovdqu YMMWORD[rdx],ymm12
+ vmovdqu YMMWORD[32+rdx],ymm13
+ vmovdqu YMMWORD[64+rdx],ymm14
+ vmovdqu YMMWORD[96+rdx],ymm15
+
+ sub rcx,-128
+ add r8,-128
+ cmp r8,127
+ jbe NEAR $L$ghash_last_ciphertext_4x__func1
+ALIGN 16
+$L$crypt_loop_4x__func1:
+
+
+
+
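+; Main loop: each iteration generates and encrypts 8 counter blocks while
+; hashing the 128 bytes of ciphertext produced by the previous iteration,
+; interleaving the vaesenc rounds with the vpclmulqdq work.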
+ vmovdqu ymm2,YMMWORD[$L$inc_2blocks]
+ vpshufb ymm12,ymm11,ymm0
+ vpaddd ymm11,ymm11,ymm2
+ vpshufb ymm13,ymm11,ymm0
+ vpaddd ymm11,ymm11,ymm2
+ vpshufb ymm14,ymm11,ymm0
+ vpaddd ymm11,ymm11,ymm2
+ vpshufb ymm15,ymm11,ymm0
+ vpaddd ymm11,ymm11,ymm2
+
+
+ vpxor ymm12,ymm12,ymm9
+ vpxor ymm13,ymm13,ymm9
+ vpxor ymm14,ymm14,ymm9
+ vpxor ymm15,ymm15,ymm9
+
+ cmp r10d,24
+ jl NEAR $L$aes128__func1
+ je NEAR $L$aes192__func1
+
+ vbroadcasti128 ymm2,XMMWORD[((-208))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+ vbroadcasti128 ymm2,XMMWORD[((-192))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+$L$aes192__func1:
+ vbroadcasti128 ymm2,XMMWORD[((-176))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+ vbroadcasti128 ymm2,XMMWORD[((-160))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+$L$aes128__func1:
+
+ vmovdqu ymm3,YMMWORD[rdx]
+ vpshufb ymm3,ymm3,ymm0
+ vmovdqu ymm4,YMMWORD[rdi]
+ vpxor ymm3,ymm3,ymm1
+ vpclmulqdq ymm5,ymm3,ymm4,0x00
+ vpclmulqdq ymm1,ymm3,ymm4,0x11
+ vpunpckhqdq ymm2,ymm3,ymm3
+ vpxor ymm2,ymm2,ymm3
+ vpclmulqdq ymm6,ymm2,ymm7,0x00
+
+ vbroadcasti128 ymm2,XMMWORD[((-144))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+
+ vbroadcasti128 ymm2,XMMWORD[((-128))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+
+ vmovdqu ymm3,YMMWORD[32+rdx]
+ vpshufb ymm3,ymm3,ymm0
+ vmovdqu ymm4,YMMWORD[32+rdi]
+ vpclmulqdq ymm2,ymm3,ymm4,0x00
+ vpxor ymm5,ymm5,ymm2
+ vpclmulqdq ymm2,ymm3,ymm4,0x11
+ vpxor ymm1,ymm1,ymm2
+ vpunpckhqdq ymm2,ymm3,ymm3
+ vpxor ymm2,ymm2,ymm3
+ vpclmulqdq ymm2,ymm2,ymm7,0x10
+ vpxor ymm6,ymm6,ymm2
+
+ vbroadcasti128 ymm2,XMMWORD[((-112))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+
+ vmovdqu ymm3,YMMWORD[64+rdx]
+ vpshufb ymm3,ymm3,ymm0
+ vmovdqu ymm4,YMMWORD[64+rdi]
+
+ vbroadcasti128 ymm2,XMMWORD[((-96))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+ vpclmulqdq ymm2,ymm3,ymm4,0x00
+ vpxor ymm5,ymm5,ymm2
+ vpclmulqdq ymm2,ymm3,ymm4,0x11
+ vpxor ymm1,ymm1,ymm2
+
+ vbroadcasti128 ymm2,XMMWORD[((-80))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+ vpunpckhqdq ymm2,ymm3,ymm3
+ vpxor ymm2,ymm2,ymm3
+ vpclmulqdq ymm2,ymm2,ymm8,0x00
+ vpxor ymm6,ymm6,ymm2
+
+
+ vmovdqu ymm3,YMMWORD[96+rdx]
+ vpshufb ymm3,ymm3,ymm0
+
+ vbroadcasti128 ymm2,XMMWORD[((-64))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+ vmovdqu ymm4,YMMWORD[96+rdi]
+ vpclmulqdq ymm2,ymm3,ymm4,0x00
+ vpxor ymm5,ymm5,ymm2
+ vpclmulqdq ymm2,ymm3,ymm4,0x11
+ vpxor ymm1,ymm1,ymm2
+ vpunpckhqdq ymm2,ymm3,ymm3
+ vpxor ymm2,ymm2,ymm3
+ vpclmulqdq ymm2,ymm2,ymm8,0x10
+ vpxor ymm6,ymm6,ymm2
+
+ vbroadcasti128 ymm2,XMMWORD[((-48))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+
+ vpxor ymm6,ymm6,ymm5
+ vpxor ymm6,ymm6,ymm1
+
+
+ vbroadcasti128 ymm4,XMMWORD[$L$gfpoly]
+ vpclmulqdq ymm2,ymm4,ymm5,0x01
+ vpshufd ymm5,ymm5,0x4e
+ vpxor ymm6,ymm6,ymm5
+ vpxor ymm6,ymm6,ymm2
+
+ vbroadcasti128 ymm2,XMMWORD[((-32))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+
+ vpclmulqdq ymm2,ymm4,ymm6,0x01
+ vpshufd ymm6,ymm6,0x4e
+ vpxor ymm1,ymm1,ymm6
+ vpxor ymm1,ymm1,ymm2
+
+ vbroadcasti128 ymm2,XMMWORD[((-16))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+ vextracti128 xmm2,ymm1,1
+ vpxor xmm1,xmm1,xmm2
+
+
+ sub rdx,-128
+ vpxor ymm2,ymm10,YMMWORD[rcx]
+ vpxor ymm3,ymm10,YMMWORD[32+rcx]
+ vpxor ymm5,ymm10,YMMWORD[64+rcx]
+ vpxor ymm6,ymm10,YMMWORD[96+rcx]
+ vaesenclast ymm12,ymm12,ymm2
+ vaesenclast ymm13,ymm13,ymm3
+ vaesenclast ymm14,ymm14,ymm5
+ vaesenclast ymm15,ymm15,ymm6
+ vmovdqu YMMWORD[rdx],ymm12
+ vmovdqu YMMWORD[32+rdx],ymm13
+ vmovdqu YMMWORD[64+rdx],ymm14
+ vmovdqu YMMWORD[96+rdx],ymm15
+
+ sub rcx,-128
+
+ add r8,-128
+ cmp r8,127
+ ja NEAR $L$crypt_loop_4x__func1
+$L$ghash_last_ciphertext_4x__func1:
+
+ vmovdqu ymm3,YMMWORD[rdx]
+ vpshufb ymm3,ymm3,ymm0
+ vmovdqu ymm4,YMMWORD[rdi]
+ vpxor ymm3,ymm3,ymm1
+ vpclmulqdq ymm5,ymm3,ymm4,0x00
+ vpclmulqdq ymm1,ymm3,ymm4,0x11
+ vpunpckhqdq ymm2,ymm3,ymm3
+ vpxor ymm2,ymm2,ymm3
+ vpclmulqdq ymm6,ymm2,ymm7,0x00
+
+ vmovdqu ymm3,YMMWORD[32+rdx]
+ vpshufb ymm3,ymm3,ymm0
+ vmovdqu ymm4,YMMWORD[32+rdi]
+ vpclmulqdq ymm2,ymm3,ymm4,0x00
+ vpxor ymm5,ymm5,ymm2
+ vpclmulqdq ymm2,ymm3,ymm4,0x11
+ vpxor ymm1,ymm1,ymm2
+ vpunpckhqdq ymm2,ymm3,ymm3
+ vpxor ymm2,ymm2,ymm3
+ vpclmulqdq ymm2,ymm2,ymm7,0x10
+ vpxor ymm6,ymm6,ymm2
+
+ vmovdqu ymm3,YMMWORD[64+rdx]
+ vpshufb ymm3,ymm3,ymm0
+ vmovdqu ymm4,YMMWORD[64+rdi]
+ vpclmulqdq ymm2,ymm3,ymm4,0x00
+ vpxor ymm5,ymm5,ymm2
+ vpclmulqdq ymm2,ymm3,ymm4,0x11
+ vpxor ymm1,ymm1,ymm2
+ vpunpckhqdq ymm2,ymm3,ymm3
+ vpxor ymm2,ymm2,ymm3
+ vpclmulqdq ymm2,ymm2,ymm8,0x00
+ vpxor ymm6,ymm6,ymm2
+
+
+ vmovdqu ymm3,YMMWORD[96+rdx]
+ vpshufb ymm3,ymm3,ymm0
+ vmovdqu ymm4,YMMWORD[96+rdi]
+ vpclmulqdq ymm2,ymm3,ymm4,0x00
+ vpxor ymm5,ymm5,ymm2
+ vpclmulqdq ymm2,ymm3,ymm4,0x11
+ vpxor ymm1,ymm1,ymm2
+ vpunpckhqdq ymm2,ymm3,ymm3
+ vpxor ymm2,ymm2,ymm3
+ vpclmulqdq ymm2,ymm2,ymm8,0x10
+ vpxor ymm6,ymm6,ymm2
+
+ vpxor ymm6,ymm6,ymm5
+ vpxor ymm6,ymm6,ymm1
+
+
+ vbroadcasti128 ymm4,XMMWORD[$L$gfpoly]
+ vpclmulqdq ymm2,ymm4,ymm5,0x01
+ vpshufd ymm5,ymm5,0x4e
+ vpxor ymm6,ymm6,ymm5
+ vpxor ymm6,ymm6,ymm2
+
+ vpclmulqdq ymm2,ymm4,ymm6,0x01
+ vpshufd ymm6,ymm6,0x4e
+ vpxor ymm1,ymm1,ymm6
+ vpxor ymm1,ymm1,ymm2
+ vextracti128 xmm2,ymm1,1
+ vpxor xmm1,xmm1,xmm2
+
+ sub rdx,-128
+$L$crypt_loop_4x_done__func1:
+
+ test r8,r8
+ jz NEAR $L$done__func1
+
+
+
+
+
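+; Handle the final 1 to 127 bytes. rsi indexes into the table of H powers so
+; that the remaining blocks line up with the right powers, and ymm5-ymm7
+; accumulate the unreduced GHASH products until the reduction below.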
+ lea rsi,[128+rdi]
+ sub rsi,r8
+
+
+ vpxor xmm5,xmm5,xmm5
+ vpxor xmm6,xmm6,xmm6
+ vpxor xmm7,xmm7,xmm7
+
+ cmp r8,64
+ jb NEAR $L$lessthan64bytes__func1
+
+
+ vpshufb ymm12,ymm11,ymm0
+ vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks]
+ vpshufb ymm13,ymm11,ymm0
+ vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks]
+ vpxor ymm12,ymm12,ymm9
+ vpxor ymm13,ymm13,ymm9
+ lea rax,[16+r9]
+$L$vaesenc_loop_tail_1__func1:
+ vbroadcasti128 ymm2,XMMWORD[rax]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ add rax,16
+ cmp r11,rax
+ jne NEAR $L$vaesenc_loop_tail_1__func1
+ vaesenclast ymm12,ymm12,ymm10
+ vaesenclast ymm13,ymm13,ymm10
+
+
+ vmovdqu ymm2,YMMWORD[rcx]
+ vmovdqu ymm3,YMMWORD[32+rcx]
+ vpxor ymm12,ymm12,ymm2
+ vpxor ymm13,ymm13,ymm3
+ vmovdqu YMMWORD[rdx],ymm12
+ vmovdqu YMMWORD[32+rdx],ymm13
+
+
+ vpshufb ymm12,ymm12,ymm0
+ vpshufb ymm13,ymm13,ymm0
+ vpxor ymm12,ymm12,ymm1
+ vmovdqu ymm2,YMMWORD[rsi]
+ vmovdqu ymm3,YMMWORD[32+rsi]
+ vpclmulqdq ymm5,ymm12,ymm2,0x00
+ vpclmulqdq ymm6,ymm12,ymm2,0x01
+ vpclmulqdq ymm4,ymm12,ymm2,0x10
+ vpxor ymm6,ymm6,ymm4
+ vpclmulqdq ymm7,ymm12,ymm2,0x11
+ vpclmulqdq ymm4,ymm13,ymm3,0x00
+ vpxor ymm5,ymm5,ymm4
+ vpclmulqdq ymm4,ymm13,ymm3,0x01
+ vpxor ymm6,ymm6,ymm4
+ vpclmulqdq ymm4,ymm13,ymm3,0x10
+ vpxor ymm6,ymm6,ymm4
+ vpclmulqdq ymm4,ymm13,ymm3,0x11
+ vpxor ymm7,ymm7,ymm4
+
+ add rsi,64
+ add rcx,64
+ add rdx,64
+ sub r8,64
+ jz NEAR $L$reduce__func1
+
+ vpxor xmm1,xmm1,xmm1
+
+
+$L$lessthan64bytes__func1:
+ vpshufb ymm12,ymm11,ymm0
+ vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks]
+ vpshufb ymm13,ymm11,ymm0
+ vpxor ymm12,ymm12,ymm9
+ vpxor ymm13,ymm13,ymm9
+ lea rax,[16+r9]
+$L$vaesenc_loop_tail_2__func1:
+ vbroadcasti128 ymm2,XMMWORD[rax]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ add rax,16
+ cmp r11,rax
+ jne NEAR $L$vaesenc_loop_tail_2__func1
+ vaesenclast ymm12,ymm12,ymm10
+ vaesenclast ymm13,ymm13,ymm10
+
+
+
+
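+; XOR and hash only as many ciphertext blocks as remain (one, two, or three);
+; any unused keystream is discarded.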
+ cmp r8,32
+ jb NEAR $L$xor_one_block__func1
+ je NEAR $L$xor_two_blocks__func1
+
+$L$xor_three_blocks__func1:
+ vmovdqu ymm2,YMMWORD[rcx]
+ vmovdqu xmm3,XMMWORD[32+rcx]
+ vpxor ymm12,ymm12,ymm2
+ vpxor xmm13,xmm13,xmm3
+ vmovdqu YMMWORD[rdx],ymm12
+ vmovdqu XMMWORD[32+rdx],xmm13
+
+ vpshufb ymm12,ymm12,ymm0
+ vpshufb xmm13,xmm13,xmm0
+ vpxor ymm12,ymm12,ymm1
+ vmovdqu ymm2,YMMWORD[rsi]
+ vmovdqu xmm3,XMMWORD[32+rsi]
+ vpclmulqdq xmm4,xmm13,xmm3,0x00
+ vpxor ymm5,ymm5,ymm4
+ vpclmulqdq xmm4,xmm13,xmm3,0x01
+ vpxor ymm6,ymm6,ymm4
+ vpclmulqdq xmm4,xmm13,xmm3,0x10
+ vpxor ymm6,ymm6,ymm4
+ vpclmulqdq xmm4,xmm13,xmm3,0x11
+ vpxor ymm7,ymm7,ymm4
+ jmp NEAR $L$ghash_mul_one_vec_unreduced__func1
+
+$L$xor_two_blocks__func1:
+ vmovdqu ymm2,YMMWORD[rcx]
+ vpxor ymm12,ymm12,ymm2
+ vmovdqu YMMWORD[rdx],ymm12
+ vpshufb ymm12,ymm12,ymm0
+ vpxor ymm12,ymm12,ymm1
+ vmovdqu ymm2,YMMWORD[rsi]
+ jmp NEAR $L$ghash_mul_one_vec_unreduced__func1
+
+$L$xor_one_block__func1:
+ vmovdqu xmm2,XMMWORD[rcx]
+ vpxor xmm12,xmm12,xmm2
+ vmovdqu XMMWORD[rdx],xmm12
+ vpshufb xmm12,xmm12,xmm0
+ vpxor xmm12,xmm12,xmm1
+ vmovdqu xmm2,XMMWORD[rsi]
+
+$L$ghash_mul_one_vec_unreduced__func1:
+ vpclmulqdq ymm4,ymm12,ymm2,0x00
+ vpxor ymm5,ymm5,ymm4
+ vpclmulqdq ymm4,ymm12,ymm2,0x01
+ vpxor ymm6,ymm6,ymm4
+ vpclmulqdq ymm4,ymm12,ymm2,0x10
+ vpxor ymm6,ymm6,ymm4
+ vpclmulqdq ymm4,ymm12,ymm2,0x11
+ vpxor ymm7,ymm7,ymm4
+
+$L$reduce__func1:
+
+ vbroadcasti128 ymm2,XMMWORD[$L$gfpoly]
+ vpclmulqdq ymm3,ymm2,ymm5,0x01
+ vpshufd ymm5,ymm5,0x4e
+ vpxor ymm6,ymm6,ymm5
+ vpxor ymm6,ymm6,ymm3
+ vpclmulqdq ymm3,ymm2,ymm6,0x01
+ vpshufd ymm6,ymm6,0x4e
+ vpxor ymm7,ymm7,ymm6
+ vpxor ymm7,ymm7,ymm3
+ vextracti128 xmm1,ymm7,1
+ vpxor xmm1,xmm1,xmm7
+
+$L$done__func1:
+
+ vpshufb xmm1,xmm1,xmm0
+ vmovdqu XMMWORD[r12],xmm1
+
+ vzeroupper
+ movdqa xmm6,XMMWORD[rsp]
+ movdqa xmm7,XMMWORD[16+rsp]
+ movdqa xmm8,XMMWORD[32+rsp]
+ movdqa xmm9,XMMWORD[48+rsp]
+ movdqa xmm10,XMMWORD[64+rsp]
+ movdqa xmm11,XMMWORD[80+rsp]
+ movdqa xmm12,XMMWORD[96+rsp]
+ movdqa xmm13,XMMWORD[112+rsp]
+ movdqa xmm14,XMMWORD[128+rsp]
+ movdqa xmm15,XMMWORD[144+rsp]
+ add rsp,160
+ pop r12
+ pop rdi
+ pop rsi
+ ret
+$L$SEH_end_aes_gcm_enc_update_vaes_avx2_17:
+
+
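+; aes_gcm_dec_update_vaes_avx2: CTR-decrypt the source buffer while folding
+; the input ciphertext into the GHASH accumulator; the argument and register
+; usage mirrors the encryption routine above.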
+global aes_gcm_dec_update_vaes_avx2
+
+ALIGN 32
+aes_gcm_dec_update_vaes_avx2:
+
+$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1:
+_CET_ENDBR
+ push rsi
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_2:
+ push rdi
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_3:
+ push r12
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_4:
+
+ mov rsi,QWORD[64+rsp]
+ mov rdi,QWORD[72+rsp]
+ mov r12,QWORD[80+rsp]
+ sub rsp,160
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_5:
+ movdqa XMMWORD[rsp],xmm6
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_6:
+ movdqa XMMWORD[16+rsp],xmm7
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_7:
+ movdqa XMMWORD[32+rsp],xmm8
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_8:
+ movdqa XMMWORD[48+rsp],xmm9
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_9:
+ movdqa XMMWORD[64+rsp],xmm10
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_10:
+ movdqa XMMWORD[80+rsp],xmm11
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_11:
+ movdqa XMMWORD[96+rsp],xmm12
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_12:
+ movdqa XMMWORD[112+rsp],xmm13
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_13:
+ movdqa XMMWORD[128+rsp],xmm14
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_14:
+ movdqa XMMWORD[144+rsp],xmm15
+$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_15:
+
+$L$SEH_endprologue_aes_gcm_dec_update_vaes_avx2_16:
+ vbroadcasti128 ymm0,XMMWORD[$L$bswap_mask]
+
+
+
+ vmovdqu xmm1,XMMWORD[r12]
+ vpshufb xmm1,xmm1,xmm0
+ vbroadcasti128 ymm11,XMMWORD[rsi]
+ vpshufb ymm11,ymm11,ymm0
+
+
+
+ mov r10d,DWORD[240+r9]
+ lea r10d,[((-20))+r10*4]
+
+
+
+
+ lea r11,[96+r10*4+r9]
+ vbroadcasti128 ymm9,XMMWORD[r9]
+ vbroadcasti128 ymm10,XMMWORD[r11]
+
+
+ vpaddd ymm11,ymm11,YMMWORD[$L$ctr_pattern]
+
+
+
+ cmp r8,127
+ jbe NEAR $L$crypt_loop_4x_done__func2
+
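+; Decryption hashes the ciphertext it reads rather than the ciphertext it
+; writes, so GHASH and decryption of the same 128 bytes are interleaved in a
+; single pass, with no separate first or last pass.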
+ vmovdqu ymm7,YMMWORD[128+rdi]
+ vmovdqu ymm8,YMMWORD[((128+32))+rdi]
+ALIGN 16
+$L$crypt_loop_4x__func2:
+
+
+
+
+ vmovdqu ymm2,YMMWORD[$L$inc_2blocks]
+ vpshufb ymm12,ymm11,ymm0
+ vpaddd ymm11,ymm11,ymm2
+ vpshufb ymm13,ymm11,ymm0
+ vpaddd ymm11,ymm11,ymm2
+ vpshufb ymm14,ymm11,ymm0
+ vpaddd ymm11,ymm11,ymm2
+ vpshufb ymm15,ymm11,ymm0
+ vpaddd ymm11,ymm11,ymm2
+
+
+ vpxor ymm12,ymm12,ymm9
+ vpxor ymm13,ymm13,ymm9
+ vpxor ymm14,ymm14,ymm9
+ vpxor ymm15,ymm15,ymm9
+
+ cmp r10d,24
+ jl NEAR $L$aes128__func2
+ je NEAR $L$aes192__func2
+
+ vbroadcasti128 ymm2,XMMWORD[((-208))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+ vbroadcasti128 ymm2,XMMWORD[((-192))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+$L$aes192__func2:
+ vbroadcasti128 ymm2,XMMWORD[((-176))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+ vbroadcasti128 ymm2,XMMWORD[((-160))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+$L$aes128__func2:
+
+ vmovdqu ymm3,YMMWORD[rcx]
+ vpshufb ymm3,ymm3,ymm0
+ vmovdqu ymm4,YMMWORD[rdi]
+ vpxor ymm3,ymm3,ymm1
+ vpclmulqdq ymm5,ymm3,ymm4,0x00
+ vpclmulqdq ymm1,ymm3,ymm4,0x11
+ vpunpckhqdq ymm2,ymm3,ymm3
+ vpxor ymm2,ymm2,ymm3
+ vpclmulqdq ymm6,ymm2,ymm7,0x00
+
+ vbroadcasti128 ymm2,XMMWORD[((-144))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+
+ vbroadcasti128 ymm2,XMMWORD[((-128))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+
+ vmovdqu ymm3,YMMWORD[32+rcx]
+ vpshufb ymm3,ymm3,ymm0
+ vmovdqu ymm4,YMMWORD[32+rdi]
+ vpclmulqdq ymm2,ymm3,ymm4,0x00
+ vpxor ymm5,ymm5,ymm2
+ vpclmulqdq ymm2,ymm3,ymm4,0x11
+ vpxor ymm1,ymm1,ymm2
+ vpunpckhqdq ymm2,ymm3,ymm3
+ vpxor ymm2,ymm2,ymm3
+ vpclmulqdq ymm2,ymm2,ymm7,0x10
+ vpxor ymm6,ymm6,ymm2
+
+ vbroadcasti128 ymm2,XMMWORD[((-112))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+
+ vmovdqu ymm3,YMMWORD[64+rcx]
+ vpshufb ymm3,ymm3,ymm0
+ vmovdqu ymm4,YMMWORD[64+rdi]
+
+ vbroadcasti128 ymm2,XMMWORD[((-96))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+ vpclmulqdq ymm2,ymm3,ymm4,0x00
+ vpxor ymm5,ymm5,ymm2
+ vpclmulqdq ymm2,ymm3,ymm4,0x11
+ vpxor ymm1,ymm1,ymm2
+
+ vbroadcasti128 ymm2,XMMWORD[((-80))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+ vpunpckhqdq ymm2,ymm3,ymm3
+ vpxor ymm2,ymm2,ymm3
+ vpclmulqdq ymm2,ymm2,ymm8,0x00
+ vpxor ymm6,ymm6,ymm2
+
+
+ vmovdqu ymm3,YMMWORD[96+rcx]
+ vpshufb ymm3,ymm3,ymm0
+
+ vbroadcasti128 ymm2,XMMWORD[((-64))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+ vmovdqu ymm4,YMMWORD[96+rdi]
+ vpclmulqdq ymm2,ymm3,ymm4,0x00
+ vpxor ymm5,ymm5,ymm2
+ vpclmulqdq ymm2,ymm3,ymm4,0x11
+ vpxor ymm1,ymm1,ymm2
+ vpunpckhqdq ymm2,ymm3,ymm3
+ vpxor ymm2,ymm2,ymm3
+ vpclmulqdq ymm2,ymm2,ymm8,0x10
+ vpxor ymm6,ymm6,ymm2
+
+ vbroadcasti128 ymm2,XMMWORD[((-48))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+
+ vpxor ymm6,ymm6,ymm5
+ vpxor ymm6,ymm6,ymm1
+
+
+ vbroadcasti128 ymm4,XMMWORD[$L$gfpoly]
+ vpclmulqdq ymm2,ymm4,ymm5,0x01
+ vpshufd ymm5,ymm5,0x4e
+ vpxor ymm6,ymm6,ymm5
+ vpxor ymm6,ymm6,ymm2
+
+ vbroadcasti128 ymm2,XMMWORD[((-32))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+
+ vpclmulqdq ymm2,ymm4,ymm6,0x01
+ vpshufd ymm6,ymm6,0x4e
+ vpxor ymm1,ymm1,ymm6
+ vpxor ymm1,ymm1,ymm2
+
+ vbroadcasti128 ymm2,XMMWORD[((-16))+r11]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ vaesenc ymm14,ymm14,ymm2
+ vaesenc ymm15,ymm15,ymm2
+
+ vextracti128 xmm2,ymm1,1
+ vpxor xmm1,xmm1,xmm2
+
+
+
+ vpxor ymm2,ymm10,YMMWORD[rcx]
+ vpxor ymm3,ymm10,YMMWORD[32+rcx]
+ vpxor ymm5,ymm10,YMMWORD[64+rcx]
+ vpxor ymm6,ymm10,YMMWORD[96+rcx]
+ vaesenclast ymm12,ymm12,ymm2
+ vaesenclast ymm13,ymm13,ymm3
+ vaesenclast ymm14,ymm14,ymm5
+ vaesenclast ymm15,ymm15,ymm6
+ vmovdqu YMMWORD[rdx],ymm12
+ vmovdqu YMMWORD[32+rdx],ymm13
+ vmovdqu YMMWORD[64+rdx],ymm14
+ vmovdqu YMMWORD[96+rdx],ymm15
+
+ sub rcx,-128
+ sub rdx,-128
+ add r8,-128
+ cmp r8,127
+ ja NEAR $L$crypt_loop_4x__func2
+$L$crypt_loop_4x_done__func2:
+
+ test r8,r8
+ jz NEAR $L$done__func2
+
+
+
+
+
+ lea rsi,[128+rdi]
+ sub rsi,r8
+
+
+ vpxor xmm5,xmm5,xmm5
+ vpxor xmm6,xmm6,xmm6
+ vpxor xmm7,xmm7,xmm7
+
+ cmp r8,64
+ jb NEAR $L$lessthan64bytes__func2
+
+
+ vpshufb ymm12,ymm11,ymm0
+ vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks]
+ vpshufb ymm13,ymm11,ymm0
+ vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks]
+ vpxor ymm12,ymm12,ymm9
+ vpxor ymm13,ymm13,ymm9
+ lea rax,[16+r9]
+$L$vaesenc_loop_tail_1__func2:
+ vbroadcasti128 ymm2,XMMWORD[rax]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ add rax,16
+ cmp r11,rax
+ jne NEAR $L$vaesenc_loop_tail_1__func2
+ vaesenclast ymm12,ymm12,ymm10
+ vaesenclast ymm13,ymm13,ymm10
+
+
+ vmovdqu ymm2,YMMWORD[rcx]
+ vmovdqu ymm3,YMMWORD[32+rcx]
+ vpxor ymm12,ymm12,ymm2
+ vpxor ymm13,ymm13,ymm3
+ vmovdqu YMMWORD[rdx],ymm12
+ vmovdqu YMMWORD[32+rdx],ymm13
+
+
+ vpshufb ymm12,ymm2,ymm0
+ vpshufb ymm13,ymm3,ymm0
+ vpxor ymm12,ymm12,ymm1
+ vmovdqu ymm2,YMMWORD[rsi]
+ vmovdqu ymm3,YMMWORD[32+rsi]
+ vpclmulqdq ymm5,ymm12,ymm2,0x00
+ vpclmulqdq ymm6,ymm12,ymm2,0x01
+ vpclmulqdq ymm4,ymm12,ymm2,0x10
+ vpxor ymm6,ymm6,ymm4
+ vpclmulqdq ymm7,ymm12,ymm2,0x11
+ vpclmulqdq ymm4,ymm13,ymm3,0x00
+ vpxor ymm5,ymm5,ymm4
+ vpclmulqdq ymm4,ymm13,ymm3,0x01
+ vpxor ymm6,ymm6,ymm4
+ vpclmulqdq ymm4,ymm13,ymm3,0x10
+ vpxor ymm6,ymm6,ymm4
+ vpclmulqdq ymm4,ymm13,ymm3,0x11
+ vpxor ymm7,ymm7,ymm4
+
+ add rsi,64
+ add rcx,64
+ add rdx,64
+ sub r8,64
+ jz NEAR $L$reduce__func2
+
+ vpxor xmm1,xmm1,xmm1
+
+
+$L$lessthan64bytes__func2:
+ vpshufb ymm12,ymm11,ymm0
+ vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks]
+ vpshufb ymm13,ymm11,ymm0
+ vpxor ymm12,ymm12,ymm9
+ vpxor ymm13,ymm13,ymm9
+ lea rax,[16+r9]
+$L$vaesenc_loop_tail_2__func2:
+ vbroadcasti128 ymm2,XMMWORD[rax]
+ vaesenc ymm12,ymm12,ymm2
+ vaesenc ymm13,ymm13,ymm2
+ add rax,16
+ cmp r11,rax
+ jne NEAR $L$vaesenc_loop_tail_2__func2
+ vaesenclast ymm12,ymm12,ymm10
+ vaesenclast ymm13,ymm13,ymm10
+
+
+
+
+ cmp r8,32
+ jb NEAR $L$xor_one_block__func2
+ je NEAR $L$xor_two_blocks__func2
+
+$L$xor_three_blocks__func2:
+ vmovdqu ymm2,YMMWORD[rcx]
+ vmovdqu xmm3,XMMWORD[32+rcx]
+ vpxor ymm12,ymm12,ymm2
+ vpxor xmm13,xmm13,xmm3
+ vmovdqu YMMWORD[rdx],ymm12
+ vmovdqu XMMWORD[32+rdx],xmm13
+
+ vpshufb ymm12,ymm2,ymm0
+ vpshufb xmm13,xmm3,xmm0
+ vpxor ymm12,ymm12,ymm1
+ vmovdqu ymm2,YMMWORD[rsi]
+ vmovdqu xmm3,XMMWORD[32+rsi]
+ vpclmulqdq xmm4,xmm13,xmm3,0x00
+ vpxor ymm5,ymm5,ymm4
+ vpclmulqdq xmm4,xmm13,xmm3,0x01
+ vpxor ymm6,ymm6,ymm4
+ vpclmulqdq xmm4,xmm13,xmm3,0x10
+ vpxor ymm6,ymm6,ymm4
+ vpclmulqdq xmm4,xmm13,xmm3,0x11
+ vpxor ymm7,ymm7,ymm4
+ jmp NEAR $L$ghash_mul_one_vec_unreduced__func2
+
+$L$xor_two_blocks__func2:
+ vmovdqu ymm2,YMMWORD[rcx]
+ vpxor ymm12,ymm12,ymm2
+ vmovdqu YMMWORD[rdx],ymm12
+ vpshufb ymm12,ymm2,ymm0
+ vpxor ymm12,ymm12,ymm1
+ vmovdqu ymm2,YMMWORD[rsi]
+ jmp NEAR $L$ghash_mul_one_vec_unreduced__func2
+
+$L$xor_one_block__func2:
+ vmovdqu xmm2,XMMWORD[rcx]
+ vpxor xmm12,xmm12,xmm2
+ vmovdqu XMMWORD[rdx],xmm12
+ vpshufb xmm12,xmm2,xmm0
+ vpxor xmm12,xmm12,xmm1
+ vmovdqu xmm2,XMMWORD[rsi]
+
+$L$ghash_mul_one_vec_unreduced__func2:
+ vpclmulqdq ymm4,ymm12,ymm2,0x00
+ vpxor ymm5,ymm5,ymm4
+ vpclmulqdq ymm4,ymm12,ymm2,0x01
+ vpxor ymm6,ymm6,ymm4
+ vpclmulqdq ymm4,ymm12,ymm2,0x10
+ vpxor ymm6,ymm6,ymm4
+ vpclmulqdq ymm4,ymm12,ymm2,0x11
+ vpxor ymm7,ymm7,ymm4
+
+$L$reduce__func2:
+
+ vbroadcasti128 ymm2,XMMWORD[$L$gfpoly]
+ vpclmulqdq ymm3,ymm2,ymm5,0x01
+ vpshufd ymm5,ymm5,0x4e
+ vpxor ymm6,ymm6,ymm5
+ vpxor ymm6,ymm6,ymm3
+ vpclmulqdq ymm3,ymm2,ymm6,0x01
+ vpshufd ymm6,ymm6,0x4e
+ vpxor ymm7,ymm7,ymm6
+ vpxor ymm7,ymm7,ymm3
+ vextracti128 xmm1,ymm7,1
+ vpxor xmm1,xmm1,xmm7
+
+$L$done__func2:
+
+ vpshufb xmm1,xmm1,xmm0
+ vmovdqu XMMWORD[r12],xmm1
+
+ vzeroupper
+ movdqa xmm6,XMMWORD[rsp]
+ movdqa xmm7,XMMWORD[16+rsp]
+ movdqa xmm8,XMMWORD[32+rsp]
+ movdqa xmm9,XMMWORD[48+rsp]
+ movdqa xmm10,XMMWORD[64+rsp]
+ movdqa xmm11,XMMWORD[80+rsp]
+ movdqa xmm12,XMMWORD[96+rsp]
+ movdqa xmm13,XMMWORD[112+rsp]
+ movdqa xmm14,XMMWORD[128+rsp]
+ movdqa xmm15,XMMWORD[144+rsp]
+ add rsp,160
+ pop r12
+ pop rdi
+ pop rsi
+ ret
+$L$SEH_end_aes_gcm_dec_update_vaes_avx2_17:
+
+
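+; Windows SEH unwind metadata: .pdata maps each function's begin and end
+; labels to its unwind info in .xdata, which records the prologue pushes,
+; stack allocation, and saved xmm registers.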
+section .pdata rdata align=4
+ALIGN 4
+ DD $L$SEH_begin_gcm_init_vpclmulqdq_avx2_1 wrt ..imagebase
+ DD $L$SEH_end_gcm_init_vpclmulqdq_avx2_5 wrt ..imagebase
+ DD $L$SEH_info_gcm_init_vpclmulqdq_avx2_0 wrt ..imagebase
+
+ DD $L$SEH_begin_gcm_gmult_vpclmulqdq_avx2_1 wrt ..imagebase
+ DD $L$SEH_end_gcm_gmult_vpclmulqdq_avx2_5 wrt ..imagebase
+ DD $L$SEH_info_gcm_gmult_vpclmulqdq_avx2_0 wrt ..imagebase
+
+ DD $L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1 wrt ..imagebase
+ DD $L$SEH_end_gcm_ghash_vpclmulqdq_avx2_8 wrt ..imagebase
+ DD $L$SEH_info_gcm_ghash_vpclmulqdq_avx2_0 wrt ..imagebase
+
+ DD $L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 wrt ..imagebase
+ DD $L$SEH_end_aes_gcm_enc_update_vaes_avx2_17 wrt ..imagebase
+ DD $L$SEH_info_aes_gcm_enc_update_vaes_avx2_0 wrt ..imagebase
+
+ DD $L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 wrt ..imagebase
+ DD $L$SEH_end_aes_gcm_dec_update_vaes_avx2_17 wrt ..imagebase
+ DD $L$SEH_info_aes_gcm_dec_update_vaes_avx2_0 wrt ..imagebase
+
+
+section .xdata rdata align=8
+ALIGN 4
+$L$SEH_info_gcm_init_vpclmulqdq_avx2_0:
+ DB 1
+ DB $L$SEH_endprologue_gcm_init_vpclmulqdq_avx2_4-$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1
+ DB 3
+ DB 0
+ DB $L$SEH_prologue_gcm_init_vpclmulqdq_avx2_3-$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1
+ DB 104
+ DW 0
+ DB $L$SEH_prologue_gcm_init_vpclmulqdq_avx2_2-$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1
+ DB 34
+
+ DW 0
+$L$SEH_info_gcm_gmult_vpclmulqdq_avx2_0:
+ DB 1
+ DB $L$SEH_endprologue_gcm_gmult_vpclmulqdq_avx2_4-$L$SEH_begin_gcm_gmult_vpclmulqdq_avx2_1
+ DB 3
+ DB 0
+ DB $L$SEH_prologue_gcm_gmult_vpclmulqdq_avx2_3-$L$SEH_begin_gcm_gmult_vpclmulqdq_avx2_1
+ DB 104
+ DW 0
+ DB $L$SEH_prologue_gcm_gmult_vpclmulqdq_avx2_2-$L$SEH_begin_gcm_gmult_vpclmulqdq_avx2_1
+ DB 34
+
+ DW 0
+$L$SEH_info_gcm_ghash_vpclmulqdq_avx2_0:
+ DB 1
+ DB $L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx2_7-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1
+ DB 9
+ DB 0
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_6-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1
+ DB 152
+ DW 3
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_5-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1
+ DB 136
+ DW 2
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_4-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1
+ DB 120
+ DW 1
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_3-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1
+ DB 104
+ DW 0
+ DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_2-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1
+ DB 130
+
+ DW 0
+$L$SEH_info_aes_gcm_enc_update_vaes_avx2_0:
+ DB 1
+ DB $L$SEH_endprologue_aes_gcm_enc_update_vaes_avx2_16-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+ DB 25
+ DB 0
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_15-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+ DB 248
+ DW 9
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_14-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+ DB 232
+ DW 8
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_13-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+ DB 216
+ DW 7
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_12-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+ DB 200
+ DW 6
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_11-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+ DB 184
+ DW 5
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_10-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+ DB 168
+ DW 4
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_9-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+ DB 152
+ DW 3
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_8-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+ DB 136
+ DW 2
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_7-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+ DB 120
+ DW 1
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_6-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+ DB 104
+ DW 0
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_5-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+ DB 1
+ DW 20
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_4-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+ DB 192
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_3-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+ DB 112
+ DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_2-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1
+ DB 96
+
+ DW 0
+$L$SEH_info_aes_gcm_dec_update_vaes_avx2_0:
+ DB 1
+ DB $L$SEH_endprologue_aes_gcm_dec_update_vaes_avx2_16-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+ DB 25
+ DB 0
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_15-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+ DB 248
+ DW 9
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_14-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+ DB 232
+ DW 8
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_13-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+ DB 216
+ DW 7
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_12-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+ DB 200
+ DW 6
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_11-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+ DB 184
+ DW 5
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_10-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+ DB 168
+ DW 4
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_9-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+ DB 152
+ DW 3
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_8-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+ DB 136
+ DW 2
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_7-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+ DB 120
+ DW 1
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_6-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+ DB 104
+ DW 0
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_5-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+ DB 1
+ DW 20
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_4-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+ DB 192
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_3-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+ DB 112
+ DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_2-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1
+ DB 96
+
+ DW 0
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/sources.bzl b/gen/sources.bzl
index f91b49e..5af0dd2 100644
--- a/gen/sources.bzl
+++ b/gen/sources.bzl
@@ -104,6 +104,8 @@
bcm_sources_asm = [
"gen/bcm/aes-gcm-avx10-x86_64-apple.S",
"gen/bcm/aes-gcm-avx10-x86_64-linux.S",
+ "gen/bcm/aes-gcm-avx2-x86_64-apple.S",
+ "gen/bcm/aes-gcm-avx2-x86_64-linux.S",
"gen/bcm/aesni-gcm-x86_64-apple.S",
"gen/bcm/aesni-gcm-x86_64-linux.S",
"gen/bcm/aesni-x86-apple.S",
@@ -203,6 +205,7 @@
bcm_sources_nasm = [
"gen/bcm/aes-gcm-avx10-x86_64-win.asm",
+ "gen/bcm/aes-gcm-avx2-x86_64-win.asm",
"gen/bcm/aesni-gcm-x86_64-win.asm",
"gen/bcm/aesni-x86-win.asm",
"gen/bcm/aesni-x86_64-win.asm",
diff --git a/gen/sources.cmake b/gen/sources.cmake
index 369a9e6..bbbb9c2 100644
--- a/gen/sources.cmake
+++ b/gen/sources.cmake
@@ -110,6 +110,8 @@
gen/bcm/aes-gcm-avx10-x86_64-apple.S
gen/bcm/aes-gcm-avx10-x86_64-linux.S
+ gen/bcm/aes-gcm-avx2-x86_64-apple.S
+ gen/bcm/aes-gcm-avx2-x86_64-linux.S
gen/bcm/aesni-gcm-x86_64-apple.S
gen/bcm/aesni-gcm-x86_64-linux.S
gen/bcm/aesni-x86-apple.S
@@ -211,6 +213,7 @@
BCM_SOURCES_NASM
gen/bcm/aes-gcm-avx10-x86_64-win.asm
+ gen/bcm/aes-gcm-avx2-x86_64-win.asm
gen/bcm/aesni-gcm-x86_64-win.asm
gen/bcm/aesni-x86-win.asm
gen/bcm/aesni-x86_64-win.asm
diff --git a/gen/sources.gni b/gen/sources.gni
index d9862d9..b5c3d54 100644
--- a/gen/sources.gni
+++ b/gen/sources.gni
@@ -104,6 +104,8 @@
bcm_sources_asm = [
"gen/bcm/aes-gcm-avx10-x86_64-apple.S",
"gen/bcm/aes-gcm-avx10-x86_64-linux.S",
+ "gen/bcm/aes-gcm-avx2-x86_64-apple.S",
+ "gen/bcm/aes-gcm-avx2-x86_64-linux.S",
"gen/bcm/aesni-gcm-x86_64-apple.S",
"gen/bcm/aesni-gcm-x86_64-linux.S",
"gen/bcm/aesni-x86-apple.S",
@@ -203,6 +205,7 @@
bcm_sources_nasm = [
"gen/bcm/aes-gcm-avx10-x86_64-win.asm",
+ "gen/bcm/aes-gcm-avx2-x86_64-win.asm",
"gen/bcm/aesni-gcm-x86_64-win.asm",
"gen/bcm/aesni-x86-win.asm",
"gen/bcm/aesni-x86_64-win.asm",
diff --git a/gen/sources.json b/gen/sources.json
index 1b482e1..c4604c8 100644
--- a/gen/sources.json
+++ b/gen/sources.json
@@ -88,6 +88,8 @@
"asm": [
"gen/bcm/aes-gcm-avx10-x86_64-apple.S",
"gen/bcm/aes-gcm-avx10-x86_64-linux.S",
+ "gen/bcm/aes-gcm-avx2-x86_64-apple.S",
+ "gen/bcm/aes-gcm-avx2-x86_64-linux.S",
"gen/bcm/aesni-gcm-x86_64-apple.S",
"gen/bcm/aesni-gcm-x86_64-linux.S",
"gen/bcm/aesni-x86-apple.S",
@@ -186,6 +188,7 @@
],
"nasm": [
"gen/bcm/aes-gcm-avx10-x86_64-win.asm",
+ "gen/bcm/aes-gcm-avx2-x86_64-win.asm",
"gen/bcm/aesni-gcm-x86_64-win.asm",
"gen/bcm/aesni-x86-win.asm",
"gen/bcm/aesni-x86_64-win.asm",