| #!/usr/bin/env perl |
| # Copyright 2024 The BoringSSL Authors |
| # |
| # Permission to use, copy, modify, and/or distribute this software for any |
| # purpose with or without fee is hereby granted, provided that the above |
| # copyright notice and this permission notice appear in all copies. |
| # |
| # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
| # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
| # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
| # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION |
| # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
| # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
| # |
| #------------------------------------------------------------------------------ |
| # |
| # VAES and VPCLMULQDQ optimized AES-GCM for x86_64 (AVX2 version) |
| # |
| # This is similar to aes-gcm-avx10-x86_64.pl, but it uses AVX2 instead of AVX512 |
| # / AVX10. This means it can only use 16 vector registers instead of 32, the |
| # maximum vector length is 32 bytes, and some instructions such as vpternlogd |
| # and masked loads/stores are unavailable. However, it is able to run on CPUs |
| # that have VAES without AVX512 / AVX10, namely AMD Zen 3 (including "Milan" |
| # server processors) and some Intel client CPUs such as Alder Lake. |
| # |
| # This implementation also uses Karatsuba multiplication instead of schoolbook |
| # multiplication for GHASH in its main loop. This does not help much on Intel, |
# but it improves performance by ~5% on AMD Zen 3, which is the main target for
| # this implementation. Other factors weighing slightly in favor of Karatsuba |
| # multiplication in this implementation are the lower maximum vector length |
| # (which means there is space left in the Htable array to cache the halves of |
| # the key powers XOR'd together) and the unavailability of the vpternlogd |
| # instruction (which helped schoolbook a bit more than Karatsuba). |
| |
| use strict; |
| |
| my $flavour = shift; |
| my $output = shift; |
| if ( $flavour =~ /\./ ) { $output = $flavour; undef $flavour; } |
| |
| my $win64; |
| my @argregs; |
| if ( $flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/ ) { |
| $win64 = 1; |
| @argregs = ( "%rcx", "%rdx", "%r8", "%r9" ); |
| } |
| else { |
| $win64 = 0; |
| @argregs = ( "%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9" ); |
| } |
| |
| $0 =~ m/(.*[\/\\])[^\/\\]+$/; |
| my $dir = $1; |
| my $xlate; |
| ( $xlate = "${dir}x86_64-xlate.pl" and -f $xlate ) |
| or ( $xlate = "${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate ) |
| or die "can't locate x86_64-xlate.pl"; |
| |
open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
| *STDOUT = *OUT; |
| |
| my $g_cur_func_name; |
| my $g_cur_func_uses_seh; |
| my @g_cur_func_saved_gpregs; |
| my @g_cur_func_saved_xmmregs; |
| |
| sub _begin_func { |
| my ( $funcname, $uses_seh ) = @_; |
| $g_cur_func_name = $funcname; |
| $g_cur_func_uses_seh = $uses_seh; |
| @g_cur_func_saved_gpregs = (); |
| @g_cur_func_saved_xmmregs = (); |
| return <<___; |
| .globl $funcname |
| .type $funcname,\@abi-omnipotent |
| .align 32 |
| $funcname: |
| .cfi_startproc |
| @{[ $uses_seh ? ".seh_startproc" : "" ]} |
| _CET_ENDBR |
| ___ |
| } |
| |
| # Push a list of general purpose registers onto the stack. |
| sub _save_gpregs { |
| my @gpregs = @_; |
| my $code = ""; |
| die "_save_gpregs requires uses_seh" unless $g_cur_func_uses_seh; |
| die "_save_gpregs can only be called once per function" |
| if @g_cur_func_saved_gpregs; |
| die "Order must be _save_gpregs, then _save_xmmregs" |
| if @g_cur_func_saved_xmmregs; |
| @g_cur_func_saved_gpregs = @gpregs; |
| for my $reg (@gpregs) { |
| $code .= "push $reg\n"; |
| if ($win64) { |
| $code .= ".seh_pushreg $reg\n"; |
| } |
| else { |
| $code .= ".cfi_push $reg\n"; |
| } |
| } |
| return $code; |
| } |
| |
| # Push a list of xmm registers onto the stack if the target is Windows. |
| sub _save_xmmregs { |
| my @xmmregs = @_; |
| my $num_xmmregs = scalar @xmmregs; |
| my $code = ""; |
| die "_save_xmmregs requires uses_seh" unless $g_cur_func_uses_seh; |
| die "_save_xmmregs can only be called once per function" |
| if @g_cur_func_saved_xmmregs; |
| if ( $win64 and $num_xmmregs > 0 ) { |
| @g_cur_func_saved_xmmregs = @xmmregs; |
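# At function entry, %rsp is 8 bytes past 16-byte alignment (the return
# address), and each pushed GP register toggles that. So if an even number of
# GP registers were saved, an extra 8 bytes are needed to keep the movdqa
# save area 16-byte aligned.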
| my $is_misaligned = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0; |
| my $alloc_size = 16 * $num_xmmregs + ( $is_misaligned ? 8 : 0 ); |
| $code .= "sub \$$alloc_size, %rsp\n"; |
| $code .= ".seh_stackalloc $alloc_size\n"; |
| for my $i ( 0 .. $num_xmmregs - 1 ) { |
| my $reg_num = $xmmregs[$i]; |
| my $pos = 16 * $i; |
| $code .= "movdqa %xmm$reg_num, $pos(%rsp)\n"; |
| $code .= ".seh_savexmm %xmm$reg_num, $pos\n"; |
| } |
| } |
| return $code; |
| } |
| |
| sub _end_func { |
| my $code = ""; |
| |
| # Restore any xmm registers that were saved earlier. |
| my $num_xmmregs = scalar @g_cur_func_saved_xmmregs; |
| if ( $win64 and $num_xmmregs > 0 ) { |
| my $need_alignment = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0; |
| my $alloc_size = 16 * $num_xmmregs + ( $need_alignment ? 8 : 0 ); |
| for my $i ( 0 .. $num_xmmregs - 1 ) { |
| my $reg_num = $g_cur_func_saved_xmmregs[$i]; |
| my $pos = 16 * $i; |
| $code .= "movdqa $pos(%rsp), %xmm$reg_num\n"; |
| } |
| $code .= "add \$$alloc_size, %rsp\n"; |
| } |
| |
| # Restore any general purpose registers that were saved earlier. |
| for my $reg ( reverse @g_cur_func_saved_gpregs ) { |
| $code .= "pop $reg\n"; |
| if ( !$win64 ) { |
| $code .= ".cfi_pop $reg\n"; |
| } |
| } |
| |
| $code .= <<___; |
| ret |
| @{[ $g_cur_func_uses_seh ? ".seh_endproc" : "" ]} |
| .cfi_endproc |
| .size $g_cur_func_name, . - $g_cur_func_name |
| ___ |
| return $code; |
| } |
| |
| my $code = <<___; |
| .section .rodata |
| .align 16 |
| |
| # A shuffle mask that reflects the bytes of 16-byte blocks |
| .Lbswap_mask: |
| .quad 0x08090a0b0c0d0e0f, 0x0001020304050607 |
| |
| # This is the GHASH reducing polynomial without its constant term, i.e. |
| # x^128 + x^7 + x^2 + x, represented using the backwards mapping |
| # between bits and polynomial coefficients. |
| # |
| # Alternatively, it can be interpreted as the naturally-ordered |
| # representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the |
| # "reversed" GHASH reducing polynomial without its x^128 term. |
| .Lgfpoly: |
| .quad 1, 0xc200000000000000 |
| |
| # Same as above, but with the (1 << 64) bit set. |
| .Lgfpoly_and_internal_carrybit: |
| .quad 1, 0xc200000000000001 |
| |
| .align 32 |
| # The below constants are used for incrementing the counter blocks. |
| .Lctr_pattern: |
| .quad 0, 0 |
| .quad 1, 0 |
| .Linc_2blocks: |
| .quad 2, 0 |
| .quad 2, 0 |
| |
| .text |
| ___ |
| |
| # We use Htable[0..7] to store H^8 through H^1, and Htable[8..11] to store the |
| # 64-bit halves of the key powers XOR'd together (for Karatsuba multiplication) |
| # in the order 8,6,7,5,4,2,3,1. We do not use Htable[12..15]. |
| my $NUM_H_POWERS = 8; |
| my $OFFSETOFEND_H_POWERS = $NUM_H_POWERS * 16; |
| my $OFFSETOF_H_POWERS_XORED = $OFFSETOFEND_H_POWERS; |
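# In byte terms: the key powers occupy Htable bytes 0..127, the XOR'd halves
# occupy bytes 128..191, and the last 64 bytes of the 256-byte table are
# unused.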
| |
| # Offset to 'rounds' in AES_KEY struct |
| my $OFFSETOF_AES_ROUNDS = 240; |
| |
# GHASH-multiply the 128-bit lanes of $a by the 128-bit lanes of $b and store
# the reduced products in $dst. Uses schoolbook multiplication.
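#
# The three 128-bit pieces LO, MI, and HI hold the unreduced 256-bit product at
# bit offsets 0, 64, and 128. The two "fold" steps below then fold LO into MI
# and MI into HI using the x^63 + x^62 + x^57 constant in $gfpoly, so that
# $dst ends up holding the fully reduced 128-bit product.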
| sub _ghash_mul { |
| my ( $a, $b, $dst, $gfpoly, $t0, $t1, $t2 ) = @_; |
| return <<___; |
| vpclmulqdq \$0x00, $a, $b, $t0 # LO = a_L * b_L |
| vpclmulqdq \$0x01, $a, $b, $t1 # MI_0 = a_L * b_H |
| vpclmulqdq \$0x10, $a, $b, $t2 # MI_1 = a_H * b_L |
| vpxor $t2, $t1, $t1 # MI = MI_0 + MI_1 |
| vpclmulqdq \$0x01, $t0, $gfpoly, $t2 # LO_L*(x^63 + x^62 + x^57) |
| vpshufd \$0x4e, $t0, $t0 # Swap halves of LO |
| vpxor $t0, $t1, $t1 # Fold LO into MI (part 1) |
| vpxor $t2, $t1, $t1 # Fold LO into MI (part 2) |
| vpclmulqdq \$0x11, $a, $b, $dst # HI = a_H * b_H |
| vpclmulqdq \$0x01, $t1, $gfpoly, $t0 # MI_L*(x^63 + x^62 + x^57) |
| vpshufd \$0x4e, $t1, $t1 # Swap halves of MI |
| vpxor $t1, $dst, $dst # Fold MI into HI (part 1) |
| vpxor $t0, $dst, $dst # Fold MI into HI (part 2) |
| ___ |
| } |
| |
| # void gcm_init_vpclmulqdq_avx2(u128 Htable[16], const uint64_t H[2]); |
| # |
| # Initialize |Htable| with powers of the GHASH subkey |H|. |
| # |
| # We use Htable[0..7] to store H^8 through H^1, and Htable[8..11] to store the |
| # 64-bit halves of the key powers XOR'd together (for Karatsuba multiplication) |
| # in the order 8,6,7,5,4,2,3,1. We do not use Htable[12..15]. |
| $code .= _begin_func "gcm_init_vpclmulqdq_avx2", 1; |
| { |
| my ( $HTABLE, $H_PTR ) = @argregs[ 0 .. 1 ]; |
| my ( $TMP0, $TMP0_XMM ) = ( "%ymm0", "%xmm0" ); |
| my ( $TMP1, $TMP1_XMM ) = ( "%ymm1", "%xmm1" ); |
| my ( $TMP2, $TMP2_XMM ) = ( "%ymm2", "%xmm2" ); |
| my ( $H_CUR, $H_CUR_XMM ) = ( "%ymm3", "%xmm3" ); |
| my ( $H_CUR2, $H_CUR2_XMM ) = ( "%ymm4", "%xmm4" ); |
| my ( $H_INC, $H_INC_XMM ) = ( "%ymm5", "%xmm5" ); |
| my ( $GFPOLY, $GFPOLY_XMM ) = ( "%ymm6", "%xmm6" ); |
| |
| $code .= <<___; |
| @{[ _save_xmmregs (6) ]} |
| .seh_endprologue |
| |
| # Load the byte-reflected hash subkey. BoringSSL provides it in |
| # byte-reflected form except the two halves are in the wrong order. |
| vpshufd \$0x4e, ($H_PTR), $H_CUR_XMM |
| |
| # Finish preprocessing the byte-reflected hash subkey by multiplying it by |
| # x^-1 ("standard" interpretation of polynomial coefficients) or |
| # equivalently x^1 (natural interpretation). This gets the key into a |
| # format that avoids having to bit-reflect the data blocks later. |
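#
# Concretely: vpaddq doubles each 64-bit half of H, i.e. shifts it left by 1
# within each qword. The vpshufd+vpsrad pair builds a mask from the bits that
# are about to be shifted out, and the vpand/vpxor add them back in the right
# places: the bit shifted out of the low qword becomes the "internal
# carrybit" (bit 64), and the bit shifted out of the top is reduced by XORing
# in the .Lgfpoly terms.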
| vpshufd \$0xd3, $H_CUR_XMM, $TMP0_XMM |
| vpsrad \$31, $TMP0_XMM, $TMP0_XMM |
| vpaddq $H_CUR_XMM, $H_CUR_XMM, $H_CUR_XMM |
| vpand .Lgfpoly_and_internal_carrybit(%rip), $TMP0_XMM, $TMP0_XMM |
| vpxor $TMP0_XMM, $H_CUR_XMM, $H_CUR_XMM |
| |
| vbroadcasti128 .Lgfpoly(%rip), $GFPOLY |
| |
| # Square H^1 to get H^2. |
| @{[ _ghash_mul $H_CUR_XMM, $H_CUR_XMM, $H_INC_XMM, $GFPOLY_XMM, |
| $TMP0_XMM, $TMP1_XMM, $TMP2_XMM ]} |
| |
| # Create H_CUR = [H^2, H^1] and H_INC = [H^2, H^2]. |
| vinserti128 \$1, $H_CUR_XMM, $H_INC, $H_CUR |
| vinserti128 \$1, $H_INC_XMM, $H_INC, $H_INC |
| |
| # Compute H_CUR2 = [H^4, H^3]. |
| @{[ _ghash_mul $H_INC, $H_CUR, $H_CUR2, $GFPOLY, $TMP0, $TMP1, $TMP2 ]} |
| |
| # Store [H^2, H^1] and [H^4, H^3]. |
| vmovdqu $H_CUR, 3*32($HTABLE) |
| vmovdqu $H_CUR2, 2*32($HTABLE) |
| |
| # For Karatsuba multiplication: compute and store the two 64-bit halves of |
| # each key power XOR'd together. Order is 4,2,3,1. |
| vpunpcklqdq $H_CUR, $H_CUR2, $TMP0 |
| vpunpckhqdq $H_CUR, $H_CUR2, $TMP1 |
| vpxor $TMP1, $TMP0, $TMP0 |
| vmovdqu $TMP0, $OFFSETOF_H_POWERS_XORED+32($HTABLE) |
| |
| # Compute and store H_CUR = [H^6, H^5] and H_CUR2 = [H^8, H^7]. |
| @{[ _ghash_mul $H_INC, $H_CUR2, $H_CUR, $GFPOLY, $TMP0, $TMP1, $TMP2 ]} |
| @{[ _ghash_mul $H_INC, $H_CUR, $H_CUR2, $GFPOLY, $TMP0, $TMP1, $TMP2 ]} |
| vmovdqu $H_CUR, 1*32($HTABLE) |
| vmovdqu $H_CUR2, 0*32($HTABLE) |
| |
| # Again, compute and store the two 64-bit halves of each key power XOR'd |
| # together. Order is 8,6,7,5. |
| vpunpcklqdq $H_CUR, $H_CUR2, $TMP0 |
| vpunpckhqdq $H_CUR, $H_CUR2, $TMP1 |
| vpxor $TMP1, $TMP0, $TMP0 |
| vmovdqu $TMP0, $OFFSETOF_H_POWERS_XORED($HTABLE) |
| |
| vzeroupper |
| ___ |
| } |
| $code .= _end_func; |
| |
| # Do one step of the GHASH update of four vectors of data blocks. |
| # $i: the step to do, 0 through 9 |
| # $ghashdata_ptr: pointer to the data blocks (ciphertext or AAD) |
| # $htable: pointer to the Htable for the key |
| # $bswap_mask: mask for reflecting the bytes of blocks |
| # $h_pow[2-1]_xored: XOR'd key powers cached from Htable |
| # $tmp[0-2]: temporary registers. $tmp[1-2] must be preserved across steps. |
| # $lo, $mi: working state for this macro that must be preserved across steps |
| # $ghash_acc: the GHASH accumulator (input/output) |
| sub _ghash_step_4x { |
| my ( |
| $i, $ghashdata_ptr, $htable, $bswap_mask, |
| $h_pow2_xored, $h_pow1_xored, $tmp0, $tmp0_xmm, |
| $tmp1, $tmp2, $lo, $mi, |
| $ghash_acc, $ghash_acc_xmm |
| ) = @_; |
| my ( $hi, $hi_xmm ) = ( $ghash_acc, $ghash_acc_xmm ); # alias |
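# Aliasing hi to ghash_acc is safe: step 0 reads the old accumulator
# (folding it into the first data vector) before hi is first written, and
# step 9 writes the reduced result back into the accumulator.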
| if ( $i == 0 ) { |
| return <<___; |
| # First vector |
| vmovdqu 0*32($ghashdata_ptr), $tmp1 |
| vpshufb $bswap_mask, $tmp1, $tmp1 |
| vmovdqu 0*32($htable), $tmp2 |
| vpxor $ghash_acc, $tmp1, $tmp1 |
| vpclmulqdq \$0x00, $tmp2, $tmp1, $lo |
| vpclmulqdq \$0x11, $tmp2, $tmp1, $hi |
| vpunpckhqdq $tmp1, $tmp1, $tmp0 |
| vpxor $tmp1, $tmp0, $tmp0 |
| vpclmulqdq \$0x00, $h_pow2_xored, $tmp0, $mi |
| ___ |
| } |
| elsif ( $i == 1 ) { |
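# No GHASH work is emitted for this step; it is presumably left empty to
# balance the amount of GHASH work paired with each AES round in the main
# loop.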
| return <<___; |
| ___ |
| } |
| elsif ( $i == 2 ) { |
| return <<___; |
| # Second vector |
| vmovdqu 1*32($ghashdata_ptr), $tmp1 |
| vpshufb $bswap_mask, $tmp1, $tmp1 |
| vmovdqu 1*32($htable), $tmp2 |
| vpclmulqdq \$0x00, $tmp2, $tmp1, $tmp0 |
| vpxor $tmp0, $lo, $lo |
| vpclmulqdq \$0x11, $tmp2, $tmp1, $tmp0 |
| vpxor $tmp0, $hi, $hi |
| vpunpckhqdq $tmp1, $tmp1, $tmp0 |
| vpxor $tmp1, $tmp0, $tmp0 |
| vpclmulqdq \$0x10, $h_pow2_xored, $tmp0, $tmp0 |
| vpxor $tmp0, $mi, $mi |
| ___ |
| } |
| elsif ( $i == 3 ) { |
| return <<___; |
| # Third vector |
| vmovdqu 2*32($ghashdata_ptr), $tmp1 |
| vpshufb $bswap_mask, $tmp1, $tmp1 |
| vmovdqu 2*32($htable), $tmp2 |
| ___ |
| } |
| elsif ( $i == 4 ) { |
| return <<___; |
| vpclmulqdq \$0x00, $tmp2, $tmp1, $tmp0 |
| vpxor $tmp0, $lo, $lo |
| vpclmulqdq \$0x11, $tmp2, $tmp1, $tmp0 |
| vpxor $tmp0, $hi, $hi |
| ___ |
| } |
| elsif ( $i == 5 ) { |
| return <<___; |
| vpunpckhqdq $tmp1, $tmp1, $tmp0 |
| vpxor $tmp1, $tmp0, $tmp0 |
| vpclmulqdq \$0x00, $h_pow1_xored, $tmp0, $tmp0 |
| vpxor $tmp0, $mi, $mi |
| |
| # Fourth vector |
| vmovdqu 3*32($ghashdata_ptr), $tmp1 |
| vpshufb $bswap_mask, $tmp1, $tmp1 |
| ___ |
| } |
| elsif ( $i == 6 ) { |
| return <<___; |
| vmovdqu 3*32($htable), $tmp2 |
| vpclmulqdq \$0x00, $tmp2, $tmp1, $tmp0 |
| vpxor $tmp0, $lo, $lo |
| vpclmulqdq \$0x11, $tmp2, $tmp1, $tmp0 |
| vpxor $tmp0, $hi, $hi |
| vpunpckhqdq $tmp1, $tmp1, $tmp0 |
| vpxor $tmp1, $tmp0, $tmp0 |
| vpclmulqdq \$0x10, $h_pow1_xored, $tmp0, $tmp0 |
| vpxor $tmp0, $mi, $mi |
| ___ |
| } |
| elsif ( $i == 7 ) { |
| return <<___; |
| # Finalize 'mi' following Karatsuba multiplication. |
| vpxor $lo, $mi, $mi |
| vpxor $hi, $mi, $mi |
| |
| # Fold lo into mi. |
| vbroadcasti128 .Lgfpoly(%rip), $tmp2 |
| vpclmulqdq \$0x01, $lo, $tmp2, $tmp0 |
| vpshufd \$0x4e, $lo, $lo |
| vpxor $lo, $mi, $mi |
| vpxor $tmp0, $mi, $mi |
| ___ |
| } |
| elsif ( $i == 8 ) { |
| return <<___; |
| # Fold mi into hi. |
| vpclmulqdq \$0x01, $mi, $tmp2, $tmp0 |
| vpshufd \$0x4e, $mi, $mi |
| vpxor $mi, $hi, $hi |
| vpxor $tmp0, $hi, $hi |
| ___ |
| } |
| elsif ( $i == 9 ) { |
| return <<___; |
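# XOR the two 128-bit lanes together to produce the fully reduced GHASH
# accumulator.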
| vextracti128 \$1, $hi, $tmp0_xmm |
| vpxor $tmp0_xmm, $hi_xmm, $ghash_acc_xmm |
| ___ |
| } |
| } |
| |
| sub _ghash_4x { |
| my $code = ""; |
| for my $i ( 0 .. 9 ) { |
| $code .= _ghash_step_4x $i, @_; |
| } |
| return $code; |
| } |
| |
| # void gcm_gmult_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16]); |
| $code .= _begin_func "gcm_gmult_vpclmulqdq_avx2", 1; |
| { |
| my ( $GHASH_ACC_PTR, $HTABLE ) = @argregs[ 0 .. 1 ]; |
| my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) = |
| map( "%xmm$_", ( 0 .. 6 ) ); |
| |
| $code .= <<___; |
| @{[ _save_xmmregs (6) ]} |
| .seh_endprologue |
| |
| vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC |
| vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK |
| vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1 |
| vmovdqu .Lgfpoly(%rip), $GFPOLY |
| vpshufb $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC |
| |
| @{[ _ghash_mul $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $T0, $T1, $T2 ]} |
| |
| vpshufb $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC |
| vmovdqu $GHASH_ACC, ($GHASH_ACC_PTR) |
| ___ |
| } |
| $code .= _end_func; |
| |
| # void gcm_ghash_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16], |
| # const uint8_t *in, size_t len); |
| # |
| # Using the key |Htable|, update the GHASH accumulator |Xi| with the data given |
| # by |in| and |len|. |len| must be a multiple of 16. |
| # |
| # This function handles large amounts of AAD efficiently, while also keeping the |
# overhead low for small amounts of AAD, which is the common case. TLS uses less
| # than one block of AAD, but (uncommonly) other use cases may use much more. |
| $code .= _begin_func "gcm_ghash_vpclmulqdq_avx2", 1; |
| { |
| # Function arguments |
| my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ]; |
| |
| # Additional local variables |
| my ( $TMP0, $TMP0_XMM ) = ( "%ymm0", "%xmm0" ); |
| my ( $TMP1, $TMP1_XMM ) = ( "%ymm1", "%xmm1" ); |
| my ( $TMP2, $TMP2_XMM ) = ( "%ymm2", "%xmm2" ); |
| my ( $LO, $LO_XMM ) = ( "%ymm3", "%xmm3" ); |
| my ( $MI, $MI_XMM ) = ( "%ymm4", "%xmm4" ); |
| my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%ymm5", "%xmm5" ); |
| my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%ymm6", "%xmm6" ); |
| my ( $GFPOLY, $GFPOLY_XMM ) = ( "%ymm7", "%xmm7" ); |
| my $H_POW2_XORED = "%ymm8"; |
| my $H_POW1_XORED = "%ymm9"; |
| |
| $code .= <<___; |
| @{[ _save_xmmregs (6 .. 9) ]} |
| .seh_endprologue |
| |
| vbroadcasti128 .Lbswap_mask(%rip), $BSWAP_MASK |
| vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM |
| vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM |
| vbroadcasti128 .Lgfpoly(%rip), $GFPOLY |
| |
# Optimize for the common case of AADLEN < 32 by checking that threshold
# before the 128-byte one.
| cmp \$32, $AADLEN |
| jb .Lghash_lastblock |
| |
| cmp \$127, $AADLEN |
| jbe .Lghash_loop_1x |
| |
| # Update GHASH with 128 bytes of AAD at a time. |
| vmovdqu $OFFSETOF_H_POWERS_XORED($HTABLE), $H_POW2_XORED |
| vmovdqu $OFFSETOF_H_POWERS_XORED+32($HTABLE), $H_POW1_XORED |
| .Lghash_loop_4x: |
| @{[ _ghash_4x $AAD, $HTABLE, $BSWAP_MASK, $H_POW2_XORED, $H_POW1_XORED, |
| $TMP0, $TMP0_XMM, $TMP1, $TMP2, $LO, $MI, $GHASH_ACC, |
| $GHASH_ACC_XMM ]} |
sub \$-128, $AAD # -128 fits in a 1-byte immediate; +128 would need 4 bytes
| add \$-128, $AADLEN |
| cmp \$127, $AADLEN |
| ja .Lghash_loop_4x |
| |
| # Update GHASH with 32 bytes of AAD at a time. |
| cmp \$32, $AADLEN |
| jb .Lghash_loop_1x_done |
| .Lghash_loop_1x: |
| vmovdqu ($AAD), $TMP0 |
| vpshufb $BSWAP_MASK, $TMP0, $TMP0 |
| vpxor $TMP0, $GHASH_ACC, $GHASH_ACC |
| vmovdqu $OFFSETOFEND_H_POWERS-32($HTABLE), $TMP0 |
| @{[ _ghash_mul $TMP0, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $TMP1, $TMP2, $LO ]} |
| vextracti128 \$1, $GHASH_ACC, $TMP0_XMM |
| vpxor $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM |
| add \$32, $AAD |
| sub \$32, $AADLEN |
| cmp \$32, $AADLEN |
| jae .Lghash_loop_1x |
| .Lghash_loop_1x_done: |
| # Issue the vzeroupper that is needed after using ymm registers. Do it here |
| # instead of at the end, to minimize overhead for small AADLEN. |
| vzeroupper |
| |
| # Update GHASH with the remaining 16-byte block if any. |
| .Lghash_lastblock: |
| test $AADLEN, $AADLEN |
| jz .Lghash_done |
| vmovdqu ($AAD), $TMP0_XMM |
| vpshufb $BSWAP_MASK_XMM, $TMP0_XMM, $TMP0_XMM |
| vpxor $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM |
| vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $TMP0_XMM |
| @{[ _ghash_mul $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM, |
| $TMP1_XMM, $TMP2_XMM, $LO_XMM ]} |
| |
| .Lghash_done: |
| # Store the updated GHASH accumulator back to memory. |
| vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM |
| vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR) |
| ___ |
| } |
| $code .= _end_func; |
| |
| sub _vaesenc_4x { |
| my ( $round_key, $aesdata0, $aesdata1, $aesdata2, $aesdata3 ) = @_; |
| return <<___; |
| vaesenc $round_key, $aesdata0, $aesdata0 |
| vaesenc $round_key, $aesdata1, $aesdata1 |
| vaesenc $round_key, $aesdata2, $aesdata2 |
| vaesenc $round_key, $aesdata3, $aesdata3 |
| ___ |
| } |
| |
| sub _ctr_begin_4x { |
| my ( |
| $le_ctr, $bswap_mask, $rndkey0, $aesdata0, |
| $aesdata1, $aesdata2, $aesdata3, $tmp |
| ) = @_; |
| return <<___; |
| # Increment le_ctr four times to generate four vectors of little-endian |
| # counter blocks, swap each to big-endian, and store them in aesdata[0-3]. |
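# Note that only the low 32-bit word of each counter block is incremented, as
# the GCM standard requires; .Linc_2blocks adds 2 to that word only.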
| vmovdqu .Linc_2blocks(%rip), $tmp |
| vpshufb $bswap_mask, $le_ctr, $aesdata0 |
| vpaddd $tmp, $le_ctr, $le_ctr |
| vpshufb $bswap_mask, $le_ctr, $aesdata1 |
| vpaddd $tmp, $le_ctr, $le_ctr |
| vpshufb $bswap_mask, $le_ctr, $aesdata2 |
| vpaddd $tmp, $le_ctr, $le_ctr |
| vpshufb $bswap_mask, $le_ctr, $aesdata3 |
| vpaddd $tmp, $le_ctr, $le_ctr |
| |
| # AES "round zero": XOR in the zero-th round key. |
| vpxor $rndkey0, $aesdata0, $aesdata0 |
| vpxor $rndkey0, $aesdata1, $aesdata1 |
| vpxor $rndkey0, $aesdata2, $aesdata2 |
| vpxor $rndkey0, $aesdata3, $aesdata3 |
| ___ |
| } |
| |
| # Do the last AES round for four vectors of counter blocks, XOR four vectors of |
| # source data with the resulting keystream blocks, and write the result to the |
# destination buffer. The implementation differs slightly from that
# description: it takes advantage of the property
# vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a) to reduce latency, but
# the effect is the same.
| sub _aesenclast_and_xor_4x { |
| my ( |
| $src, $dst, $rndkeylast, $aesdata0, |
| $aesdata1, $aesdata2, $aesdata3, $t0, |
| $t1, $t2, $t3 |
| ) = @_; |
| return <<___; |
| vpxor 0*32($src), $rndkeylast, $t0 |
| vpxor 1*32($src), $rndkeylast, $t1 |
| vpxor 2*32($src), $rndkeylast, $t2 |
| vpxor 3*32($src), $rndkeylast, $t3 |
| vaesenclast $t0, $aesdata0, $aesdata0 |
| vaesenclast $t1, $aesdata1, $aesdata1 |
| vaesenclast $t2, $aesdata2, $aesdata2 |
| vaesenclast $t3, $aesdata3, $aesdata3 |
| vmovdqu $aesdata0, 0*32($dst) |
| vmovdqu $aesdata1, 1*32($dst) |
| vmovdqu $aesdata2, 2*32($dst) |
| vmovdqu $aesdata3, 3*32($dst) |
| ___ |
| } |
| |
| my $g_update_macro_expansion_count = 0; |
| |
| # void aes_gcm_{enc,dec}_update_vaes_avx2(const uint8_t *in, uint8_t *out, |
| # size_t len, const AES_KEY *key, |
| # const uint8_t ivec[16], |
| # const u128 Htable[16], |
| # uint8_t Xi[16]); |
| # |
| # This macro generates a GCM encryption or decryption update function with the |
| # above prototype (with \enc selecting which one). The function computes the |
| # next portion of the CTR keystream, XOR's it with |len| bytes from |in|, and |
| # writes the resulting encrypted or decrypted data to |out|. It also updates |
| # the GHASH accumulator |Xi| using the next |len| ciphertext bytes. |
| # |
| # |len| must be a multiple of 16. The caller must do any buffering needed to |
| # ensure this. Both in-place and out-of-place en/decryption are supported. |
| # |
| # |ivec| must give the current counter in big-endian format. This function |
| # loads the counter from |ivec| and increments the loaded counter as needed, but |
| # it does *not* store the updated counter back to |ivec|. The caller must |
| # update |ivec| if any more data segments follow. Internally, only the low |
| # 32-bit word of the counter is incremented, following the GCM standard. |
| sub _aes_gcm_update { |
| my $local_label_suffix = "__func" . ++$g_update_macro_expansion_count; |
| my ($enc) = @_; |
| my $code = ""; |
| |
| # Function arguments |
| my ( $SRC, $DST, $DATALEN, $AESKEY, $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR ) |
| = $win64 |
| ? ( @argregs[ 0 .. 3 ], "%rsi", "%rdi", "%r12" ) |
| : ( @argregs[ 0 .. 5 ], "%r12" ); |
| |
| # Additional local variables. |
| # %rax is used as a temporary register. BE_CTR_PTR is also available as a |
| # temporary register after the counter is loaded. |
| |
| # AES key length in bytes |
| my ( $AESKEYLEN, $AESKEYLEN64 ) = ( "%r10d", "%r10" ); |
| |
| # Pointer to the last AES round key for the chosen AES variant |
| my $RNDKEYLAST_PTR = "%r11"; |
| |
| # BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values |
| # using vpshufb, copied to all 128-bit lanes. |
| my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%ymm0", "%xmm0" ); |
| |
| # GHASH_ACC is the accumulator variable for GHASH. When fully reduced, |
| # only the lowest 128-bit lane can be nonzero. When not fully reduced, |
| # more than one lane may be used, and they need to be XOR'd together. |
| my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%ymm1", "%xmm1" ); |
| |
| # TMP[0-2] are temporary registers. |
| my ( $TMP0, $TMP0_XMM ) = ( "%ymm2", "%xmm2" ); |
| my ( $TMP1, $TMP1_XMM ) = ( "%ymm3", "%xmm3" ); |
| my ( $TMP2, $TMP2_XMM ) = ( "%ymm4", "%xmm4" ); |
| |
| # LO and MI are used to accumulate unreduced GHASH products. |
| my ( $LO, $LO_XMM ) = ( "%ymm5", "%xmm5" ); |
| my ( $MI, $MI_XMM ) = ( "%ymm6", "%xmm6" ); |
| |
| # Cached key powers from Htable |
| my ( $H_POW2_XORED, $H_POW2_XORED_XMM ) = ( "%ymm7", "%xmm7" ); |
| my ( $H_POW1_XORED, $H_POW1_XORED_XMM ) = ( "%ymm8", "%xmm8" ); |
| |
| # RNDKEY0 caches the zero-th round key, and RNDKEYLAST the last one. |
| my $RNDKEY0 = "%ymm9"; |
| my $RNDKEYLAST = "%ymm10"; |
| |
| # LE_CTR contains the next set of little-endian counter blocks. |
| my $LE_CTR = "%ymm11"; |
| |
| # AESDATA[0-3] hold the counter blocks that are being encrypted by AES. |
| my ( $AESDATA0, $AESDATA0_XMM ) = ( "%ymm12", "%xmm12" ); |
| my ( $AESDATA1, $AESDATA1_XMM ) = ( "%ymm13", "%xmm13" ); |
| my ( $AESDATA2, $AESDATA2_XMM ) = ( "%ymm14", "%xmm14" ); |
| my ( $AESDATA3, $AESDATA3_XMM ) = ( "%ymm15", "%xmm15" ); |
| my @AESDATA = ( $AESDATA0, $AESDATA1, $AESDATA2, $AESDATA3 ); |
| |
| my @ghash_4x_args = ( |
| $enc ? $DST : $SRC, $HTABLE, $BSWAP_MASK, $H_POW2_XORED, |
| $H_POW1_XORED, $TMP0, $TMP0_XMM, $TMP1, |
| $TMP2, $LO, $MI, $GHASH_ACC, |
| $GHASH_ACC_XMM |
| ); |
| |
| if ($win64) { |
| $code .= <<___; |
| @{[ _save_gpregs $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR ]} |
| mov 64(%rsp), $BE_CTR_PTR # arg5 |
| mov 72(%rsp), $HTABLE # arg6 |
| mov 80(%rsp), $GHASH_ACC_PTR # arg7 |
| @{[ _save_xmmregs (6 .. 15) ]} |
| .seh_endprologue |
| ___ |
| } |
| else { |
| $code .= <<___; |
| @{[ _save_gpregs $GHASH_ACC_PTR ]} |
| mov 16(%rsp), $GHASH_ACC_PTR # arg7 |
| ___ |
| } |
| |
| if ($enc) { |
| $code .= <<___; |
| #ifdef BORINGSSL_DISPATCH_TEST |
| .extern BORINGSSL_function_hit |
| movb \$1,BORINGSSL_function_hit+8(%rip) |
| #endif |
| ___ |
| } |
| $code .= <<___; |
| vbroadcasti128 .Lbswap_mask(%rip), $BSWAP_MASK |
| |
| # Load the GHASH accumulator and the starting counter. |
| # BoringSSL passes these values in big endian format. |
| vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM |
| vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM |
| vbroadcasti128 ($BE_CTR_PTR), $LE_CTR |
| vpshufb $BSWAP_MASK, $LE_CTR, $LE_CTR |
| |
| # Load the AES key length in bytes. BoringSSL stores number of rounds |
| # minus 1, so convert using: AESKEYLEN = 4 * aeskey->rounds - 20. |
| movl $OFFSETOF_AES_ROUNDS($AESKEY), $AESKEYLEN |
| lea -20(,$AESKEYLEN,4), $AESKEYLEN |
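# e.g. for AES-128 the stored value is 9 (10 rounds minus 1), giving
# AESKEYLEN = 4*9 - 20 = 16.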
| |
| # Make RNDKEYLAST_PTR point to the last AES round key. This is the |
| # round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256 |
| # respectively. Then load the zero-th and last round keys. |
| lea 6*16($AESKEY,$AESKEYLEN64,4), $RNDKEYLAST_PTR |
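# e.g. for AES-128, AESKEYLEN is 16, so RNDKEYLAST_PTR = AESKEY + 6*16 + 4*16
# = AESKEY + 160, which is the round key with index 10.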
| vbroadcasti128 ($AESKEY), $RNDKEY0 |
| vbroadcasti128 ($RNDKEYLAST_PTR), $RNDKEYLAST |
| |
| # Finish initializing LE_CTR by adding 1 to the second block. |
| vpaddd .Lctr_pattern(%rip), $LE_CTR, $LE_CTR |
| |
| # If there are at least 128 bytes of data, then continue into the loop that |
| # processes 128 bytes of data at a time. Otherwise skip it. |
| cmp \$127, $DATALEN |
| jbe .Lcrypt_loop_4x_done$local_label_suffix |
| |
| vmovdqu $OFFSETOF_H_POWERS_XORED($HTABLE), $H_POW2_XORED |
| vmovdqu $OFFSETOF_H_POWERS_XORED+32($HTABLE), $H_POW1_XORED |
| ___ |
| |
| # Main loop: en/decrypt and hash 4 vectors (128 bytes) at a time. |
| |
| if ($enc) { |
| $code .= <<___; |
| # Encrypt the first 4 vectors of plaintext blocks. |
| @{[ _ctr_begin_4x $LE_CTR, $BSWAP_MASK, $RNDKEY0, @AESDATA, $TMP0 ]} |
| lea 16($AESKEY), %rax |
| .Lvaesenc_loop_first_4_vecs$local_label_suffix: |
| vbroadcasti128 (%rax), $TMP0 |
| @{[ _vaesenc_4x $TMP0, @AESDATA ]} |
| add \$16, %rax |
| cmp %rax, $RNDKEYLAST_PTR |
| jne .Lvaesenc_loop_first_4_vecs$local_label_suffix |
| @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA, |
| $TMP0, $TMP1, $LO, $MI ]} |
sub \$-128, $SRC # -128 fits in a 1-byte immediate; +128 would need 4 bytes
| add \$-128, $DATALEN |
| cmp \$127, $DATALEN |
| jbe .Lghash_last_ciphertext_4x$local_label_suffix |
| ___ |
| } |
| |
| $code .= <<___; |
| .align 16 |
| .Lcrypt_loop_4x$local_label_suffix: |
| |
| # Start the AES encryption of the counter blocks. |
| @{[ _ctr_begin_4x $LE_CTR, $BSWAP_MASK, $RNDKEY0, @AESDATA, $TMP0 ]} |
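# Dispatch on the AES key length: AES-256 runs all 13 middle rounds, AES-192
# skips the first two of them, and AES-128 the first four. The final 9 middle
# rounds are shared and are interleaved with the GHASH steps below.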
| cmp \$24, $AESKEYLEN |
| jl .Laes128$local_label_suffix |
| je .Laes192$local_label_suffix |
| # AES-256 |
| vbroadcasti128 -13*16($RNDKEYLAST_PTR), $TMP0 |
| @{[ _vaesenc_4x $TMP0, @AESDATA ]} |
| vbroadcasti128 -12*16($RNDKEYLAST_PTR), $TMP0 |
| @{[ _vaesenc_4x $TMP0, @AESDATA ]} |
| .Laes192$local_label_suffix: |
| vbroadcasti128 -11*16($RNDKEYLAST_PTR), $TMP0 |
| @{[ _vaesenc_4x $TMP0, @AESDATA ]} |
| vbroadcasti128 -10*16($RNDKEYLAST_PTR), $TMP0 |
| @{[ _vaesenc_4x $TMP0, @AESDATA ]} |
| .Laes128$local_label_suffix: |
| ___ |
| |
| # Prefetch the source data 512 bytes ahead into the L1 data cache, to |
| # improve performance when the hardware prefetcher is disabled. Assumes the |
| # L1 data cache line size is 64 bytes (de facto standard on x86_64). |
| $code .= "prefetcht0 512($SRC)\n"; |
| $code .= "prefetcht0 512+64($SRC)\n"; |
| |
| # Finish the AES encryption of the counter blocks in AESDATA[0-3], |
| # interleaved with the GHASH update of the ciphertext blocks. |
| for my $i ( reverse 1 .. 9 ) { |
| $code .= <<___; |
| @{[ _ghash_step_4x 9-$i, @ghash_4x_args ]} |
| vbroadcasti128 -$i*16($RNDKEYLAST_PTR), $TMP0 |
| @{[ _vaesenc_4x $TMP0, @AESDATA ]} |
| ___ |
| } |
| $code .= <<___; |
| @{[ _ghash_step_4x 9, @ghash_4x_args ]} |
| |
| @{[ $enc ? "sub \$-128, $DST" : "" ]} # 128 is 4 bytes, -128 is 1 byte |
| @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA, |
| $TMP0, $TMP1, $LO, $MI ]} |
| sub \$-128, $SRC |
| @{[ !$enc ? "sub \$-128, $DST" : "" ]} |
| add \$-128, $DATALEN |
| cmp \$127, $DATALEN |
| ja .Lcrypt_loop_4x$local_label_suffix |
| ___ |
| |
| if ($enc) { |
| |
| # Update GHASH with the last set of ciphertext blocks. |
| $code .= <<___; |
| .Lghash_last_ciphertext_4x$local_label_suffix: |
| @{[ _ghash_4x @ghash_4x_args ]} |
| sub \$-128, $DST |
| ___ |
| } |
| |
| my $POWERS_PTR = $BE_CTR_PTR; # BE_CTR_PTR is free to be reused. |
| my ( $HI, $HI_XMM ) = ( $H_POW2_XORED, $H_POW2_XORED_XMM ); # reuse |
| |
| $code .= <<___; |
| .Lcrypt_loop_4x_done$local_label_suffix: |
| # Check whether any data remains. |
| test $DATALEN, $DATALEN |
| jz .Ldone$local_label_suffix |
| |
| # DATALEN is in [16, 32, 48, 64, 80, 96, 112]. |
| |
| # Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N |
| # is the number of blocks that remain. |
| lea $OFFSETOFEND_H_POWERS($HTABLE), $POWERS_PTR |
| sub $DATALEN, $POWERS_PTR |
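# e.g. DATALEN = 48 (3 remaining blocks) gives POWERS_PTR = HTABLE + 128 - 48,
# which points at [H^3, H^2, H^1].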
| |
| # Start collecting the unreduced GHASH intermediate value LO, MI, HI. |
| vpxor $LO_XMM, $LO_XMM, $LO_XMM |
| vpxor $MI_XMM, $MI_XMM, $MI_XMM |
| vpxor $HI_XMM, $HI_XMM, $HI_XMM |
| |
| cmp \$64, $DATALEN |
| jb .Llessthan64bytes$local_label_suffix |
| |
| # DATALEN is in [64, 80, 96, 112]. Encrypt two vectors of counter blocks. |
| vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA0 |
| vpaddd .Linc_2blocks(%rip), $LE_CTR, $LE_CTR |
| vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA1 |
| vpaddd .Linc_2blocks(%rip), $LE_CTR, $LE_CTR |
| vpxor $RNDKEY0, $AESDATA0, $AESDATA0 |
| vpxor $RNDKEY0, $AESDATA1, $AESDATA1 |
| lea 16($AESKEY), %rax |
| .Lvaesenc_loop_tail_1$local_label_suffix: |
| vbroadcasti128 (%rax), $TMP0 |
| vaesenc $TMP0, $AESDATA0, $AESDATA0 |
| vaesenc $TMP0, $AESDATA1, $AESDATA1 |
| add \$16, %rax |
| cmp %rax, $RNDKEYLAST_PTR |
| jne .Lvaesenc_loop_tail_1$local_label_suffix |
| vaesenclast $RNDKEYLAST, $AESDATA0, $AESDATA0 |
| vaesenclast $RNDKEYLAST, $AESDATA1, $AESDATA1 |
| |
| # XOR the data with the two vectors of keystream blocks. |
| vmovdqu 0($SRC), $TMP0 |
| vmovdqu 32($SRC), $TMP1 |
| vpxor $TMP0, $AESDATA0, $AESDATA0 |
| vpxor $TMP1, $AESDATA1, $AESDATA1 |
| vmovdqu $AESDATA0, 0($DST) |
| vmovdqu $AESDATA1, 32($DST) |
| |
| # Update GHASH with two vectors of ciphertext blocks, without reducing. |
| vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0 |
| vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA1 : $TMP1 ]}, $AESDATA1 |
| vpxor $GHASH_ACC, $AESDATA0, $AESDATA0 |
| vmovdqu ($POWERS_PTR), $TMP0 |
| vmovdqu 32($POWERS_PTR), $TMP1 |
| vpclmulqdq \$0x00, $TMP0, $AESDATA0, $LO |
| vpclmulqdq \$0x01, $TMP0, $AESDATA0, $MI |
| vpclmulqdq \$0x10, $TMP0, $AESDATA0, $TMP2 |
| vpxor $TMP2, $MI, $MI |
| vpclmulqdq \$0x11, $TMP0, $AESDATA0, $HI |
| vpclmulqdq \$0x00, $TMP1, $AESDATA1, $TMP2 |
| vpxor $TMP2, $LO, $LO |
| vpclmulqdq \$0x01, $TMP1, $AESDATA1, $TMP2 |
| vpxor $TMP2, $MI, $MI |
| vpclmulqdq \$0x10, $TMP1, $AESDATA1, $TMP2 |
| vpxor $TMP2, $MI, $MI |
| vpclmulqdq \$0x11, $TMP1, $AESDATA1, $TMP2 |
| vpxor $TMP2, $HI, $HI |
| |
| add \$64, $POWERS_PTR |
| add \$64, $SRC |
| add \$64, $DST |
| sub \$64, $DATALEN |
| jz .Lreduce$local_label_suffix |
| |
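# The GHASH accumulator was already folded into the first data vector of the
# 64-byte chunk above, so zeroize it; the tail code below XORs GHASH_ACC into
# its first data vector unconditionally.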
| vpxor $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM |
| |
# DATALEN is in [16, 32, 48]. Encrypt the last two vectors of counter blocks.
| .Llessthan64bytes$local_label_suffix: |
| vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA0 |
| vpaddd .Linc_2blocks(%rip), $LE_CTR, $LE_CTR |
| vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA1 |
| vpxor $RNDKEY0, $AESDATA0, $AESDATA0 |
| vpxor $RNDKEY0, $AESDATA1, $AESDATA1 |
| lea 16($AESKEY), %rax |
| .Lvaesenc_loop_tail_2$local_label_suffix: |
| vbroadcasti128 (%rax), $TMP0 |
| vaesenc $TMP0, $AESDATA0, $AESDATA0 |
| vaesenc $TMP0, $AESDATA1, $AESDATA1 |
| add \$16, %rax |
| cmp %rax, $RNDKEYLAST_PTR |
| jne .Lvaesenc_loop_tail_2$local_label_suffix |
| vaesenclast $RNDKEYLAST, $AESDATA0, $AESDATA0 |
| vaesenclast $RNDKEYLAST, $AESDATA1, $AESDATA1 |
| |
| # XOR the remaining data with the keystream blocks, and update GHASH with |
| # the remaining ciphertext blocks without reducing. |
| |
| cmp \$32, $DATALEN |
| jb .Lxor_one_block$local_label_suffix |
| je .Lxor_two_blocks$local_label_suffix |
| |
| .Lxor_three_blocks$local_label_suffix: |
| vmovdqu 0($SRC), $TMP0 |
| vmovdqu 32($SRC), $TMP1_XMM |
| vpxor $TMP0, $AESDATA0, $AESDATA0 |
| vpxor $TMP1_XMM, $AESDATA1_XMM, $AESDATA1_XMM |
| vmovdqu $AESDATA0, 0($DST) |
| vmovdqu $AESDATA1_XMM, 32($DST) |
| |
| vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0 |
| vpshufb $BSWAP_MASK_XMM, @{[ $enc ? $AESDATA1_XMM : $TMP1_XMM ]}, $AESDATA1_XMM |
| vpxor $GHASH_ACC, $AESDATA0, $AESDATA0 |
| vmovdqu ($POWERS_PTR), $TMP0 |
| vmovdqu 32($POWERS_PTR), $TMP1_XMM |
| vpclmulqdq \$0x00, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM |
| vpxor $TMP2, $LO, $LO |
| vpclmulqdq \$0x01, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM |
| vpxor $TMP2, $MI, $MI |
| vpclmulqdq \$0x10, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM |
| vpxor $TMP2, $MI, $MI |
| vpclmulqdq \$0x11, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM |
| vpxor $TMP2, $HI, $HI |
| jmp .Lghash_mul_one_vec_unreduced$local_label_suffix |
| |
| .Lxor_two_blocks$local_label_suffix: |
| vmovdqu ($SRC), $TMP0 |
| vpxor $TMP0, $AESDATA0, $AESDATA0 |
| vmovdqu $AESDATA0, ($DST) |
| vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0 |
| vpxor $GHASH_ACC, $AESDATA0, $AESDATA0 |
| vmovdqu ($POWERS_PTR), $TMP0 |
| jmp .Lghash_mul_one_vec_unreduced$local_label_suffix |
| |
| .Lxor_one_block$local_label_suffix: |
| vmovdqu ($SRC), $TMP0_XMM |
| vpxor $TMP0_XMM, $AESDATA0_XMM, $AESDATA0_XMM |
| vmovdqu $AESDATA0_XMM, ($DST) |
| vpshufb $BSWAP_MASK_XMM, @{[ $enc ? $AESDATA0_XMM : $TMP0_XMM ]}, $AESDATA0_XMM |
| vpxor $GHASH_ACC_XMM, $AESDATA0_XMM, $AESDATA0_XMM |
| vmovdqu ($POWERS_PTR), $TMP0_XMM |
| |
| .Lghash_mul_one_vec_unreduced$local_label_suffix: |
| vpclmulqdq \$0x00, $TMP0, $AESDATA0, $TMP2 |
| vpxor $TMP2, $LO, $LO |
| vpclmulqdq \$0x01, $TMP0, $AESDATA0, $TMP2 |
| vpxor $TMP2, $MI, $MI |
| vpclmulqdq \$0x10, $TMP0, $AESDATA0, $TMP2 |
| vpxor $TMP2, $MI, $MI |
| vpclmulqdq \$0x11, $TMP0, $AESDATA0, $TMP2 |
| vpxor $TMP2, $HI, $HI |
| |
| .Lreduce$local_label_suffix: |
| # Finally, do the GHASH reduction. |
| vbroadcasti128 .Lgfpoly(%rip), $TMP0 |
| vpclmulqdq \$0x01, $LO, $TMP0, $TMP1 |
| vpshufd \$0x4e, $LO, $LO |
| vpxor $LO, $MI, $MI |
| vpxor $TMP1, $MI, $MI |
| vpclmulqdq \$0x01, $MI, $TMP0, $TMP1 |
| vpshufd \$0x4e, $MI, $MI |
| vpxor $MI, $HI, $HI |
| vpxor $TMP1, $HI, $HI |
| vextracti128 \$1, $HI, $GHASH_ACC_XMM |
| vpxor $HI_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM |
| |
| .Ldone$local_label_suffix: |
| # Store the updated GHASH accumulator back to memory. |
| vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM |
| vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR) |
| |
| vzeroupper |
| ___ |
| return $code; |
| } |
| |
| $code .= _begin_func "aes_gcm_enc_update_vaes_avx2", 1; |
| $code .= _aes_gcm_update 1; |
| $code .= _end_func; |
| |
| $code .= _begin_func "aes_gcm_dec_update_vaes_avx2", 1; |
| $code .= _aes_gcm_update 0; |
| $code .= _end_func; |
| |
| print $code; |
| close STDOUT or die "error closing STDOUT: $!"; |
| exit 0; |