| #!/usr/bin/env perl |
| # Copyright 2024 The BoringSSL Authors |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # https://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| #------------------------------------------------------------------------------ |
| # |
| # This is an AES-GCM implementation for x86_64 CPUs that support the following |
| # CPU features: VAES && VPCLMULQDQ && AVX512BW && AVX512VL && BMI2. |
| # |
| # This file is based on aes-gcm-avx10-x86_64.S from the Linux kernel |
| # (https://git.kernel.org/linus/b06affb1cb580e13). The following notable |
| # changes have been made: |
| # |
| # - Relicensed under BoringSSL's preferred license. |
| # |
| # - Converted from GNU assembler to "perlasm". This was necessary for |
| # compatibility with BoringSSL's Windows builds which use NASM instead of the |
| # GNU assembler. It was also necessary for compatibility with the 'delocate' |
| # tool used in BoringSSL's FIPS builds. |
| # |
| # - Added support for the Windows ABI. |
| # |
| # - Changed function prototypes to be compatible with what BoringSSL wants. |
| # |
| # - Removed the optimized finalization function, as BoringSSL doesn't want it. |
| # |
| # - Added a single-block GHASH multiplication function, as BoringSSL needs this. |
| # |
| # - Added optimization for large amounts of AAD. |
| # |
| # - Removed support for maximum vector lengths other than 512 bits. |
| |
| use strict; |
| |
| my $flavour = shift; |
| my $output = shift; |
| if ( $flavour =~ /\./ ) { $output = $flavour; undef $flavour; } |
| |
| my $win64; |
| my @argregs; |
| if ( $flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/ ) { |
| $win64 = 1; |
| @argregs = ( "%rcx", "%rdx", "%r8", "%r9" ); |
| } |
| else { |
| $win64 = 0; |
| @argregs = ( "%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9" ); |
| } |
| |
| $0 =~ m/(.*[\/\\])[^\/\\]+$/; |
| my $dir = $1; |
| my $xlate; |
| ( $xlate = "${dir}x86_64-xlate.pl" and -f $xlate ) |
| or ( $xlate = "${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate ) |
| or die "can't locate x86_64-xlate.pl"; |
| |
open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
| *STDOUT = *OUT; |
| |
| my $g_cur_func_name; |
| my $g_cur_func_uses_seh; |
| my @g_cur_func_saved_gpregs; |
| my @g_cur_func_saved_xmmregs; |
| |
| sub _begin_func { |
| my ( $funcname, $uses_seh ) = @_; |
| $g_cur_func_name = $funcname; |
| $g_cur_func_uses_seh = $uses_seh; |
| @g_cur_func_saved_gpregs = (); |
| @g_cur_func_saved_xmmregs = (); |
| return <<___; |
| .globl $funcname |
| .type $funcname,\@abi-omnipotent |
| .align 32 |
| $funcname: |
| .cfi_startproc |
| @{[ $uses_seh ? ".seh_startproc" : "" ]} |
| _CET_ENDBR |
| ___ |
| } |
| |
| # Push a list of general purpose registers onto the stack. |
| sub _save_gpregs { |
| my @gpregs = @_; |
| my $code = ""; |
| die "_save_gpregs requires uses_seh" unless $g_cur_func_uses_seh; |
| die "_save_gpregs can only be called once per function" |
| if @g_cur_func_saved_gpregs; |
| die "Order must be _save_gpregs, then _save_xmmregs" |
| if @g_cur_func_saved_xmmregs; |
| @g_cur_func_saved_gpregs = @gpregs; |
| for my $reg (@gpregs) { |
| $code .= "push $reg\n"; |
| if ($win64) { |
| $code .= ".seh_pushreg $reg\n"; |
| } |
| else { |
| $code .= ".cfi_push $reg\n"; |
| } |
| } |
| return $code; |
| } |
| |
# Allocate stack space and save a list of xmm registers to it, if the target
# is Windows (where xmm6 through xmm15 are callee-saved).
| sub _save_xmmregs { |
| my @xmmregs = @_; |
| my $num_xmmregs = scalar @xmmregs; |
| my $code = ""; |
| die "_save_xmmregs requires uses_seh" unless $g_cur_func_uses_seh; |
| die "_save_xmmregs can only be called once per function" |
| if @g_cur_func_saved_xmmregs; |
| if ( $win64 and $num_xmmregs > 0 ) { |
| @g_cur_func_saved_xmmregs = @xmmregs; |
| my $is_misaligned = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0; |
| my $alloc_size = 16 * $num_xmmregs + ( $is_misaligned ? 8 : 0 ); |
| $code .= "sub \$$alloc_size, %rsp\n"; |
| $code .= ".seh_stackalloc $alloc_size\n"; |
| for my $i ( 0 .. $num_xmmregs - 1 ) { |
| my $reg_num = $xmmregs[$i]; |
| my $pos = 16 * $i; |
| $code .= "vmovdqa %xmm$reg_num, $pos(%rsp)\n"; |
| $code .= ".seh_savexmm %xmm$reg_num, $pos\n"; |
| } |
| } |
| return $code; |
| } |
| |
| sub _end_func { |
| my $code = ""; |
| |
| # Restore any xmm registers that were saved earlier. |
| my $num_xmmregs = scalar @g_cur_func_saved_xmmregs; |
| if ( $win64 and $num_xmmregs > 0 ) { |
| my $need_alignment = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0; |
| my $alloc_size = 16 * $num_xmmregs + ( $need_alignment ? 8 : 0 ); |
| for my $i ( 0 .. $num_xmmregs - 1 ) { |
| my $reg_num = $g_cur_func_saved_xmmregs[$i]; |
| my $pos = 16 * $i; |
| $code .= "vmovdqa $pos(%rsp), %xmm$reg_num\n"; |
| } |
| $code .= "add \$$alloc_size, %rsp\n"; |
| } |
| |
| # Restore any general purpose registers that were saved earlier. |
| for my $reg ( reverse @g_cur_func_saved_gpregs ) { |
| $code .= "pop $reg\n"; |
| if ( !$win64 ) { |
| $code .= ".cfi_pop $reg\n"; |
| } |
| } |
| |
| $code .= <<___; |
| ret |
| @{[ $g_cur_func_uses_seh ? ".seh_endproc" : "" ]} |
| .cfi_endproc |
| .size $g_cur_func_name, . - $g_cur_func_name |
| ___ |
| return $code; |
| } |
| |
| my $code = <<___; |
| .section .rodata |
| .align 64 |
| |
| # A shuffle mask that reflects the bytes of 16-byte blocks |
| .Lbswap_mask: |
| .quad 0x08090a0b0c0d0e0f, 0x0001020304050607 |
| |
| # This is the GHASH reducing polynomial without its constant term, i.e. |
| # x^128 + x^7 + x^2 + x, represented using the backwards mapping |
| # between bits and polynomial coefficients. |
| # |
| # Alternatively, it can be interpreted as the naturally-ordered |
| # representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the |
| # "reversed" GHASH reducing polynomial without its x^128 term. |
| .Lgfpoly: |
| .quad 1, 0xc200000000000000 |
| |
| # Same as above, but with the (1 << 64) bit set. |
| .Lgfpoly_and_internal_carrybit: |
| .quad 1, 0xc200000000000001 |
| |
| # Values needed to prepare the initial vector of counter blocks. |
| .Lctr_pattern: |
| .quad 0, 0 |
| .quad 1, 0 |
| .quad 2, 0 |
| .quad 3, 0 |
| |
| # The number of AES blocks per vector, as a 128-bit value. |
| .Linc_4blocks: |
| .quad 4, 0 |
| |
| .text |
| ___ |
| |
| # Number of powers of the hash key stored in the key struct. The powers are |
| # stored from highest (H^NUM_H_POWERS) to lowest (H^1). |
| my $NUM_H_POWERS = 16; |
| |
| my $OFFSETOFEND_H_POWERS = $NUM_H_POWERS * 16; |
| |
| # Offset to 'rounds' in AES_KEY struct |
| my $OFFSETOF_AES_ROUNDS = 240; |
| |
| # The _ghash_mul macro multiplies the 128-bit lanes of \a by the corresponding |
| # 128-bit lanes of \b and stores the reduced products in \dst. \t0, \t1, and |
| # \t2 are temporary registers of the same size as \a and \b. |
| # |
| # The multiplications are done in GHASH's representation of the finite field |
| # GF(2^128). Elements of GF(2^128) are represented as binary polynomials |
| # (i.e. polynomials whose coefficients are bits) modulo a reducing polynomial |
| # G. The GCM specification uses G = x^128 + x^7 + x^2 + x + 1. Addition is |
| # just XOR, while multiplication is more complex and has two parts: (a) do |
| # carryless multiplication of two 128-bit input polynomials to get a 256-bit |
| # intermediate product polynomial, and (b) reduce the intermediate product to |
| # 128 bits by adding multiples of G that cancel out terms in it. (Adding |
| # multiples of G doesn't change which field element the polynomial represents.) |
| # |
| # Unfortunately, the GCM specification maps bits to/from polynomial |
| # coefficients backwards from the natural order. In each byte it specifies the |
| # highest bit to be the lowest order polynomial coefficient, *not* the highest! |
| # This makes it nontrivial to work with the GHASH polynomials. We could |
| # reflect the bits, but x86 doesn't have an instruction that does that. |
| # |
| # Instead, we operate on the values without bit-reflecting them. This *mostly* |
| # just works, since XOR and carryless multiplication are symmetric with respect |
| # to bit order, but it has some consequences. First, due to GHASH's byte |
| # order, by skipping bit reflection, *byte* reflection becomes necessary to |
| # give the polynomial terms a consistent order. E.g., considering an N-bit |
| # value interpreted using the G = x^128 + x^7 + x^2 + x + 1 convention, bits 0 |
| # through N-1 of the byte-reflected value represent the coefficients of x^(N-1) |
| # through x^0, whereas bits 0 through N-1 of the non-byte-reflected value |
| # represent x^7...x^0, x^15...x^8, ..., x^(N-1)...x^(N-8) which can't be worked |
| # with. Fortunately, x86's vpshufb instruction can do byte reflection. |
| # |
| # Second, forgoing the bit reflection causes an extra multiple of x (still |
| # using the G = x^128 + x^7 + x^2 + x + 1 convention) to be introduced by each |
| # multiplication. This is because an M-bit by N-bit carryless multiplication |
| # really produces a (M+N-1)-bit product, but in practice it's zero-extended to |
| # M+N bits. In the G = x^128 + x^7 + x^2 + x + 1 convention, which maps bits |
| # to polynomial coefficients backwards, this zero-extension actually changes |
| # the product by introducing an extra factor of x. Therefore, users of this |
| # macro must ensure that one of the inputs has an extra factor of x^-1, i.e. |
| # the multiplicative inverse of x, to cancel out the extra x. |
| # |
| # Third, the backwards coefficients convention is just confusing to work with, |
| # since it makes "low" and "high" in the polynomial math mean the opposite of |
| # their normal meaning in computer programming. This can be solved by using an |
| # alternative interpretation: the polynomial coefficients are understood to be |
| # in the natural order, and the multiplication is actually \a * \b * x^-128 mod |
| # x^128 + x^127 + x^126 + x^121 + 1. This doesn't change the inputs, outputs, |
| # or the implementation at all; it just changes the mathematical interpretation |
| # of what each instruction is doing. Starting from here, we'll use this |
| # alternative interpretation, as it's easier to understand the code that way. |
| # |
| # Moving onto the implementation, the vpclmulqdq instruction does 64 x 64 => |
| # 128-bit carryless multiplication, so we break the 128 x 128 multiplication |
| # into parts as follows (the _L and _H suffixes denote low and high 64 bits): |
| # |
| # LO = a_L * b_L |
| # MI = (a_L * b_H) + (a_H * b_L) |
| # HI = a_H * b_H |
| # |
| # The 256-bit product is x^128*HI + x^64*MI + LO. LO, MI, and HI are 128-bit. |
| # Note that MI "overlaps" with LO and HI. We don't consolidate MI into LO and |
| # HI right away, since the way the reduction works makes that unnecessary. |
| # |
| # For the reduction, we cancel out the low 128 bits by adding multiples of G = |
| # x^128 + x^127 + x^126 + x^121 + 1. This is done by two iterations, each of |
| # which cancels out the next lowest 64 bits. Consider a value x^64*A + B, |
| # where A and B are 128-bit. Adding B_L*G to that value gives: |
| # |
| # x^64*A + B + B_L*G |
| # = x^64*A + x^64*B_H + B_L + B_L*(x^128 + x^127 + x^126 + x^121 + 1) |
| # = x^64*A + x^64*B_H + B_L + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L |
| # = x^64*A + x^64*B_H + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L + B_L |
| # = x^64*(A + B_H + x^64*B_L + B_L*(x^63 + x^62 + x^57)) |
| # |
| # So: if we sum A, B with its halves swapped, and the low half of B times x^63 |
| # + x^62 + x^57, we get a 128-bit value C where x^64*C is congruent to the |
| # original value x^64*A + B. I.e., the low 64 bits got canceled out. |
| # |
| # We just need to apply this twice: first to fold LO into MI, and second to |
| # fold the updated MI into HI. |
| # |
| # The needed three-argument XORs are done using the vpternlogd instruction with |
| # immediate 0x96, since this is faster than two vpxord instructions. |
| # |
| # A potential optimization, assuming that b is fixed per-key (if a is fixed |
| # per-key it would work the other way around), is to use one iteration of the |
| # reduction described above to precompute a value c such that x^64*c = b mod G, |
| # and then multiply a_L by c (and implicitly by x^64) instead of by b: |
| # |
| # MI = (a_L * c_L) + (a_H * b_L) |
| # HI = (a_L * c_H) + (a_H * b_H) |
| # |
| # This would eliminate the LO part of the intermediate product, which would |
| # eliminate the need to fold LO into MI. This would save two instructions, |
| # including a vpclmulqdq. However, we currently don't use this optimization |
| # because it would require twice as many per-key precomputed values. |
| # |
| # Using Karatsuba multiplication instead of "schoolbook" multiplication |
| # similarly would save a vpclmulqdq but does not seem to be worth it. |
| sub _ghash_mul { |
| my ( $a, $b, $dst, $gfpoly, $t0, $t1, $t2 ) = @_; |
| return <<___; |
| vpclmulqdq \$0x00, $a, $b, $t0 # LO = a_L * b_L |
| vpclmulqdq \$0x01, $a, $b, $t1 # MI_0 = a_L * b_H |
| vpclmulqdq \$0x10, $a, $b, $t2 # MI_1 = a_H * b_L |
| vpxord $t2, $t1, $t1 # MI = MI_0 + MI_1 |
| vpclmulqdq \$0x01, $t0, $gfpoly, $t2 # LO_L*(x^63 + x^62 + x^57) |
| vpshufd \$0x4e, $t0, $t0 # Swap halves of LO |
| vpternlogd \$0x96, $t2, $t0, $t1 # Fold LO into MI |
| vpclmulqdq \$0x11, $a, $b, $dst # HI = a_H * b_H |
| vpclmulqdq \$0x01, $t1, $gfpoly, $t0 # MI_L*(x^63 + x^62 + x^57) |
| vpshufd \$0x4e, $t1, $t1 # Swap halves of MI |
| vpternlogd \$0x96, $t0, $t1, $dst # Fold MI into HI |
| ___ |
| } |
| |
| # GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the |
| # *unreduced* products to \lo, \mi, and \hi. |
| sub _ghash_mul_noreduce { |
| my ( $a, $b, $lo, $mi, $hi, $t0, $t1, $t2, $t3 ) = @_; |
| return <<___; |
| vpclmulqdq \$0x00, $a, $b, $t0 # a_L * b_L |
| vpclmulqdq \$0x01, $a, $b, $t1 # a_L * b_H |
| vpclmulqdq \$0x10, $a, $b, $t2 # a_H * b_L |
| vpclmulqdq \$0x11, $a, $b, $t3 # a_H * b_H |
| vpxord $t0, $lo, $lo |
| vpternlogd \$0x96, $t2, $t1, $mi |
| vpxord $t3, $hi, $hi |
| ___ |
| } |
| |
| # Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit |
| # reduced products in \hi. See _ghash_mul for explanation of reduction. |
| sub _ghash_reduce { |
| my ( $lo, $mi, $hi, $gfpoly, $t0 ) = @_; |
| return <<___; |
| vpclmulqdq \$0x01, $lo, $gfpoly, $t0 |
| vpshufd \$0x4e, $lo, $lo |
| vpternlogd \$0x96, $t0, $lo, $mi |
| vpclmulqdq \$0x01, $mi, $gfpoly, $t0 |
| vpshufd \$0x4e, $mi, $mi |
| vpternlogd \$0x96, $t0, $mi, $hi |
| ___ |
| } |
| |
# This is a specialized version of _ghash_mul that computes \a * \a, i.e. it
# squares \a. It skips computing MI = (a_L * a_H) + (a_H * a_L) = 0, since the
# two cross terms are equal and so cancel under XOR.
| sub _ghash_square { |
| my ( $a, $dst, $gfpoly, $t0, $t1 ) = @_; |
| return <<___; |
| vpclmulqdq \$0x00, $a, $a, $t0 # LO = a_L * a_L |
| vpclmulqdq \$0x11, $a, $a, $dst # HI = a_H * a_H |
| vpclmulqdq \$0x01, $t0, $gfpoly, $t1 # LO_L*(x^63 + x^62 + x^57) |
| vpshufd \$0x4e, $t0, $t0 # Swap halves of LO |
| vpxor $t0, $t1, $t1 # Fold LO into MI |
| vpclmulqdq \$0x01, $t1, $gfpoly, $t0 # MI_L*(x^63 + x^62 + x^57) |
| vpshufd \$0x4e, $t1, $t1 # Swap halves of MI |
| vpternlogd \$0x96, $t0, $t1, $dst # Fold MI into HI |
| ___ |
| } |
| |
| # void gcm_init_vpclmulqdq_avx512(u128 Htable[16], const uint64_t H[2]); |
| # |
| # Initialize |Htable| with powers of the GHASH subkey |H|. |
| # |
| # The powers are stored in the order H^NUM_H_POWERS to H^1. |
| $code .= _begin_func "gcm_init_vpclmulqdq_avx512", 0; |
| { |
| # Function arguments |
| my ( $HTABLE, $H_PTR ) = @argregs[ 0 .. 1 ]; |
| |
| # Additional local variables. %rax is used as a temporary register. |
| my ( $TMP0, $TMP0_YMM, $TMP0_XMM ) = ( "%zmm0", "%ymm0", "%xmm0" ); |
| my ( $TMP1, $TMP1_YMM, $TMP1_XMM ) = ( "%zmm1", "%ymm1", "%xmm1" ); |
| my ( $TMP2, $TMP2_YMM, $TMP2_XMM ) = ( "%zmm2", "%ymm2", "%xmm2" ); |
| my $POWERS_PTR = "%r8"; |
| my $RNDKEYLAST_PTR = "%r9"; |
| my ( $H_CUR, $H_CUR_YMM, $H_CUR_XMM ) = ( "%zmm3", "%ymm3", "%xmm3" ); |
| my ( $H_INC, $H_INC_YMM, $H_INC_XMM ) = ( "%zmm4", "%ymm4", "%xmm4" ); |
| my ( $GFPOLY, $GFPOLY_YMM, $GFPOLY_XMM ) = ( "%zmm5", "%ymm5", "%xmm5" ); |
| |
| $code .= <<___; |
| # Get pointer to lowest set of key powers (located at end of array). |
| lea $OFFSETOFEND_H_POWERS-64($HTABLE), $POWERS_PTR |
| |
| # Load the byte-reflected hash subkey. BoringSSL provides it in |
| # byte-reflected form except the two halves are in the wrong order. |
| vpshufd \$0x4e, ($H_PTR), $H_CUR_XMM |
| |
| # Finish preprocessing the first key power, H^1. Since this GHASH |
| # implementation operates directly on values with the backwards bit |
| # order specified by the GCM standard, it's necessary to preprocess the |
| # raw key as follows. First, reflect its bytes. Second, multiply it |
| # by x^-1 mod x^128 + x^7 + x^2 + x + 1 (if using the backwards |
| # interpretation of polynomial coefficients), which can also be |
| # interpreted as multiplication by x mod x^128 + x^127 + x^126 + x^121 |
| # + 1 using the alternative, natural interpretation of polynomial |
| # coefficients. For details, see the comment above _ghash_mul. |
| # |
| # Either way, for the multiplication the concrete operation performed |
| # is a left shift of the 128-bit value by 1 bit, then an XOR with (0xc2 |
| # << 120) | 1 if a 1 bit was carried out. However, there's no 128-bit |
| # wide shift instruction, so instead double each of the two 64-bit |
| # halves and incorporate the internal carry bit into the value XOR'd. |
| vpshufd \$0xd3, $H_CUR_XMM, $TMP0_XMM |
| vpsrad \$31, $TMP0_XMM, $TMP0_XMM |
| vpaddq $H_CUR_XMM, $H_CUR_XMM, $H_CUR_XMM |
| # H_CUR_XMM ^= TMP0_XMM & gfpoly_and_internal_carrybit |
| vpternlogd \$0x78, .Lgfpoly_and_internal_carrybit(%rip), $TMP0_XMM, $H_CUR_XMM |
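# (Concretely: the vpshufd/vpsrad pair leaves TMP0 holding the sign-extended
# dwords [bit 127, bit 31, bit 63, bit 127] of the raw key, so the
# vpternlogd XORs in (0xc2 << 120) | 1 when bit 127 carries out of the
# doubling, and sets bit 64, the internal carry bit, when bit 63 carries out
# of the low half.)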
| |
| # Load the gfpoly constant. |
| vbroadcasti32x4 .Lgfpoly(%rip), $GFPOLY |
| |
| # Square H^1 to get H^2. |
| # |
| # Note that as with H^1, all higher key powers also need an extra |
| # factor of x^-1 (or x using the natural interpretation). Nothing |
| # special needs to be done to make this happen, though: H^1 * H^1 would |
| # end up with two factors of x^-1, but the multiplication consumes one. |
| # So the product H^2 ends up with the desired one factor of x^-1. |
| @{[ _ghash_square $H_CUR_XMM, $H_INC_XMM, $GFPOLY_XMM, |
| $TMP0_XMM, $TMP1_XMM ]} |
| |
| # Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2]. |
| vinserti128 \$1, $H_CUR_XMM, $H_INC_YMM, $H_CUR_YMM |
| vinserti128 \$1, $H_INC_XMM, $H_INC_YMM, $H_INC_YMM |
| |
| # Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4]. |
| @{[ _ghash_mul $H_INC_YMM, $H_CUR_YMM, $H_INC_YMM, $GFPOLY_YMM, |
| $TMP0_YMM, $TMP1_YMM, $TMP2_YMM ]} |
| vinserti64x4 \$1, $H_CUR_YMM, $H_INC, $H_CUR |
| vshufi64x2 \$0, $H_INC, $H_INC, $H_INC |
| |
| # Store the lowest set of key powers. |
| vmovdqu8 $H_CUR, ($POWERS_PTR) |
| |
| # Compute and store the remaining key powers. |
| # Repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by |
| # [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)]. |
| mov \$3, %eax |
| .Lprecompute_next: |
| sub \$64, $POWERS_PTR |
| @{[ _ghash_mul $H_INC, $H_CUR, $H_CUR, $GFPOLY, $TMP0, $TMP1, $TMP2 ]} |
| vmovdqu8 $H_CUR, ($POWERS_PTR) |
| dec %eax |
| jnz .Lprecompute_next |
| |
| vzeroupper # This is needed after using ymm or zmm registers. |
| ___ |
| } |
| $code .= _end_func; |
| |
| # XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store |
| # the result in \dst_xmm. This implicitly zeroizes the other lanes of dst. |
| sub _horizontal_xor { |
| my ( $src, $src_xmm, $dst_xmm, $t0_xmm, $t1_xmm, $t2_xmm ) = @_; |
| return <<___; |
| vextracti32x4 \$1, $src, $t0_xmm |
| vextracti32x4 \$2, $src, $t1_xmm |
| vextracti32x4 \$3, $src, $t2_xmm |
| vpxord $t0_xmm, $src_xmm, $dst_xmm |
| vpternlogd \$0x96, $t1_xmm, $t2_xmm, $dst_xmm |
| ___ |
| } |
| |
| # Do one step of the GHASH update of the data blocks given in the vector |
| # registers GHASHDATA[0-3]. \i specifies the step to do, 0 through 9. The |
| # division into steps allows users of this macro to optionally interleave the |
| # computation with other instructions. This macro uses the vector register |
| # GHASH_ACC as input/output; GHASHDATA[0-3] as inputs that are clobbered; |
| # H_POW[4-1], GFPOLY, and BSWAP_MASK as inputs that aren't clobbered; and |
| # GHASHTMP[0-2] as temporaries. This macro handles the byte-reflection of the |
| # data blocks. The parameter registers must be preserved across steps. |
| # |
| # The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) + |
| # H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the |
| # operations are vectorized operations on 512-bit vectors of 128-bit blocks. |
| # The vectorized terms correspond to the following non-vectorized terms: |
| # |
| # H_POW4*(GHASHDATA0 + GHASH_ACC) => H^16*(blk0 + GHASH_ACC_XMM), |
| # H^15*(blk1 + 0), H^14*(blk2 + 0), and H^13*(blk3 + 0) |
| # H_POW3*GHASHDATA1 => H^12*blk4, H^11*blk5, H^10*blk6, and H^9*blk7 |
| # H_POW2*GHASHDATA2 => H^8*blk8, H^7*blk9, H^6*blk10, and H^5*blk11 |
| # H_POW1*GHASHDATA3 => H^4*blk12, H^3*blk13, H^2*blk14, and H^1*blk15 |
| # |
| # More concretely, this code does: |
| # - Do vectorized "schoolbook" multiplications to compute the intermediate |
| # 256-bit product of each block and its corresponding hash key power. |
| # - Sum (XOR) the intermediate 256-bit products across vectors. |
| # - Do a vectorized reduction of these 256-bit intermediate values to 128-bits |
| # each. |
| # - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM. |
| # |
| # See _ghash_mul for the full explanation of the operations performed for each |
| # individual finite field multiplication and reduction. |
| sub _ghash_step_4x { |
| my ( |
| $i, $BSWAP_MASK, $GHASHDATA0, $GHASHDATA1, |
| $GHASHDATA2, $GHASHDATA3, $GHASHDATA0_XMM, $GHASHDATA1_XMM, |
| $GHASHDATA2_XMM, $GHASHDATA3_XMM, $H_POW4, $H_POW3, |
| $H_POW2, $H_POW1, $GFPOLY, $GHASHTMP0, |
| $GHASHTMP1, $GHASHTMP2, $GHASH_ACC, $GHASH_ACC_XMM |
| ) = @_; |
| if ( $i == 0 ) { |
| return <<___; |
| vpshufb $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0 |
| vpxord $GHASH_ACC, $GHASHDATA0, $GHASHDATA0 |
| vpshufb $BSWAP_MASK, $GHASHDATA1, $GHASHDATA1 |
| vpshufb $BSWAP_MASK, $GHASHDATA2, $GHASHDATA2 |
| ___ |
| } |
| elsif ( $i == 1 ) { |
| return <<___; |
| vpshufb $BSWAP_MASK, $GHASHDATA3, $GHASHDATA3 |
| vpclmulqdq \$0x00, $H_POW4, $GHASHDATA0, $GHASH_ACC # LO_0 |
| vpclmulqdq \$0x00, $H_POW3, $GHASHDATA1, $GHASHTMP0 # LO_1 |
| vpclmulqdq \$0x00, $H_POW2, $GHASHDATA2, $GHASHTMP1 # LO_2 |
| ___ |
| } |
| elsif ( $i == 2 ) { |
| return <<___; |
| vpxord $GHASHTMP0, $GHASH_ACC, $GHASH_ACC # sum(LO_{1,0}) |
| vpclmulqdq \$0x00, $H_POW1, $GHASHDATA3, $GHASHTMP2 # LO_3 |
| vpternlogd \$0x96, $GHASHTMP2, $GHASHTMP1, $GHASH_ACC # LO = sum(LO_{3,2,1,0}) |
| vpclmulqdq \$0x01, $H_POW4, $GHASHDATA0, $GHASHTMP0 # MI_0 |
| ___ |
| } |
| elsif ( $i == 3 ) { |
| return <<___; |
| vpclmulqdq \$0x01, $H_POW3, $GHASHDATA1, $GHASHTMP1 # MI_1 |
| vpclmulqdq \$0x01, $H_POW2, $GHASHDATA2, $GHASHTMP2 # MI_2 |
| vpternlogd \$0x96, $GHASHTMP2, $GHASHTMP1, $GHASHTMP0 # sum(MI_{2,1,0}) |
| vpclmulqdq \$0x01, $H_POW1, $GHASHDATA3, $GHASHTMP1 # MI_3 |
| ___ |
| } |
| elsif ( $i == 4 ) { |
| return <<___; |
| vpclmulqdq \$0x10, $H_POW4, $GHASHDATA0, $GHASHTMP2 # MI_4 |
| vpternlogd \$0x96, $GHASHTMP2, $GHASHTMP1, $GHASHTMP0 # sum(MI_{4,3,2,1,0}) |
| vpclmulqdq \$0x10, $H_POW3, $GHASHDATA1, $GHASHTMP1 # MI_5 |
| vpclmulqdq \$0x10, $H_POW2, $GHASHDATA2, $GHASHTMP2 # MI_6 |
| ___ |
| } |
| elsif ( $i == 5 ) { |
| return <<___; |
| vpternlogd \$0x96, $GHASHTMP2, $GHASHTMP1, $GHASHTMP0 # sum(MI_{6,5,4,3,2,1,0}) |
| vpclmulqdq \$0x01, $GHASH_ACC, $GFPOLY, $GHASHTMP2 # LO_L*(x^63 + x^62 + x^57) |
| vpclmulqdq \$0x10, $H_POW1, $GHASHDATA3, $GHASHTMP1 # MI_7 |
| vpxord $GHASHTMP1, $GHASHTMP0, $GHASHTMP0 # MI = sum(MI_{7,6,5,4,3,2,1,0}) |
| ___ |
| } |
| elsif ( $i == 6 ) { |
| return <<___; |
| vpshufd \$0x4e, $GHASH_ACC, $GHASH_ACC # Swap halves of LO |
| vpclmulqdq \$0x11, $H_POW4, $GHASHDATA0, $GHASHDATA0 # HI_0 |
| vpclmulqdq \$0x11, $H_POW3, $GHASHDATA1, $GHASHDATA1 # HI_1 |
| vpclmulqdq \$0x11, $H_POW2, $GHASHDATA2, $GHASHDATA2 # HI_2 |
| ___ |
| } |
| elsif ( $i == 7 ) { |
| return <<___; |
| vpternlogd \$0x96, $GHASHTMP2, $GHASH_ACC, $GHASHTMP0 # Fold LO into MI |
| vpclmulqdq \$0x11, $H_POW1, $GHASHDATA3, $GHASHDATA3 # HI_3 |
| vpternlogd \$0x96, $GHASHDATA2, $GHASHDATA1, $GHASHDATA0 # sum(HI_{2,1,0}) |
| vpclmulqdq \$0x01, $GHASHTMP0, $GFPOLY, $GHASHTMP1 # MI_L*(x^63 + x^62 + x^57) |
| ___ |
| } |
| elsif ( $i == 8 ) { |
| return <<___; |
| vpxord $GHASHDATA3, $GHASHDATA0, $GHASH_ACC # HI = sum(HI_{3,2,1,0}) |
| vpshufd \$0x4e, $GHASHTMP0, $GHASHTMP0 # Swap halves of MI |
| vpternlogd \$0x96, $GHASHTMP1, $GHASHTMP0, $GHASH_ACC # Fold MI into HI |
| ___ |
| } |
| elsif ( $i == 9 ) { |
| return _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM, |
| $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM; |
| } |
| } |
| |
| # Update GHASH with four vectors of data blocks. See _ghash_step_4x for full |
| # explanation. |
| sub _ghash_4x { |
| my $code = ""; |
| for my $i ( 0 .. 9 ) { |
| $code .= _ghash_step_4x $i, @_; |
| } |
| return $code; |
| } |
| |
| # void gcm_gmult_vpclmulqdq_avx512(uint8_t Xi[16], const u128 Htable[16]); |
| $code .= _begin_func "gcm_gmult_vpclmulqdq_avx512", 1; |
| { |
| my ( $GHASH_ACC_PTR, $HTABLE ) = @argregs[ 0 .. 1 ]; |
| my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) = |
| map( "%xmm$_", ( 0 .. 6 ) ); |
| |
| $code .= <<___; |
| @{[ _save_xmmregs (6) ]} |
| .seh_endprologue |
| |
| vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC |
| vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK |
| vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1 |
| vmovdqu .Lgfpoly(%rip), $GFPOLY |
| vpshufb $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC |
| |
| @{[ _ghash_mul $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $T0, $T1, $T2 ]} |
| |
| vpshufb $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC |
| vmovdqu $GHASH_ACC, ($GHASH_ACC_PTR) |
| |
| # No need for vzeroupper, since only xmm registers were used. |
| ___ |
| } |
| $code .= _end_func; |
| |
| # void gcm_ghash_vpclmulqdq_avx512(uint8_t Xi[16], const u128 Htable[16], |
| # const uint8_t *in, size_t len); |
| # |
| # Using the key |Htable|, update the GHASH accumulator |Xi| with the data given |
| # by |in| and |len|. |len| must be a multiple of 16. |
| # |
# This function handles large amounts of AAD efficiently, while also keeping the
# overhead low for small amounts of AAD, which is the common case. TLS uses less
# than one block of AAD, but (uncommonly) other use cases may use much more.
| $code .= _begin_func "gcm_ghash_vpclmulqdq_avx512", 1; |
| { |
| # Function arguments |
| my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ]; |
| |
| # Additional local variables |
| my ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( "%zmm0", "%xmm0" ); |
| my ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( "%zmm1", "%xmm1" ); |
| my ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( "%zmm2", "%xmm2" ); |
| my ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( "%zmm3", "%xmm3" ); |
| my @GHASHDATA = ( $GHASHDATA0, $GHASHDATA1, $GHASHDATA2, $GHASHDATA3 ); |
| my @GHASHDATA_XMM = |
| ( $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM, $GHASHDATA3_XMM ); |
| my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%zmm4", "%xmm4" ); |
| my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%zmm5", "%xmm5" ); |
| my ( $H_POW4, $H_POW3, $H_POW2 ) = ( "%zmm6", "%zmm7", "%zmm8" ); |
| my ( $H_POW1, $H_POW1_XMM ) = ( "%zmm9", "%xmm9" ); |
| my ( $GFPOLY, $GFPOLY_XMM ) = ( "%zmm10", "%xmm10" ); |
| my ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) = |
| ( "%zmm11", "%zmm12", "%zmm13" ); |
| |
| $code .= <<___; |
| @{[ _save_xmmregs (6 .. 13) ]} |
| .seh_endprologue |
| |
| # Load the bswap_mask and gfpoly constants. Since AADLEN is usually small, |
| # usually only 128-bit vectors will be used. So as an optimization, don't |
| # broadcast these constants to all 128-bit lanes quite yet. |
| vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK_XMM |
| vmovdqu .Lgfpoly(%rip), $GFPOLY_XMM |
| |
| # Load the GHASH accumulator. |
| vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM |
| vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM |
| |
| # Optimize for AADLEN < 64 by checking for AADLEN < 64 before AADLEN < 256. |
| cmp \$64, $AADLEN |
| jb .Laad_blockbyblock |
| |
| # AADLEN >= 64, so we'll operate on full vectors. Broadcast bswap_mask and |
| # gfpoly to all 128-bit lanes. |
| vshufi64x2 \$0, $BSWAP_MASK, $BSWAP_MASK, $BSWAP_MASK |
| vshufi64x2 \$0, $GFPOLY, $GFPOLY, $GFPOLY |
| |
| # Load the lowest set of key powers. |
| vmovdqu8 $OFFSETOFEND_H_POWERS-1*64($HTABLE), $H_POW1 |
| |
| cmp \$256, $AADLEN |
| jb .Laad_loop_1x |
| |
| # AADLEN >= 256. Load the higher key powers. |
| vmovdqu8 $OFFSETOFEND_H_POWERS-4*64($HTABLE), $H_POW4 |
| vmovdqu8 $OFFSETOFEND_H_POWERS-3*64($HTABLE), $H_POW3 |
| vmovdqu8 $OFFSETOFEND_H_POWERS-2*64($HTABLE), $H_POW2 |
| |
| # Update GHASH with 256 bytes of AAD at a time. |
| .Laad_loop_4x: |
| vmovdqu8 0*64($AAD), $GHASHDATA0 |
| vmovdqu8 1*64($AAD), $GHASHDATA1 |
| vmovdqu8 2*64($AAD), $GHASHDATA2 |
| vmovdqu8 3*64($AAD), $GHASHDATA3 |
| @{[ _ghash_4x $BSWAP_MASK, @GHASHDATA, @GHASHDATA_XMM, $H_POW4, $H_POW3, |
| $H_POW2, $H_POW1, $GFPOLY, $GHASHTMP0, $GHASHTMP1, |
| $GHASHTMP2, $GHASH_ACC, $GHASH_ACC_XMM ]} |
| add \$256, $AAD |
| sub \$256, $AADLEN |
| cmp \$256, $AADLEN |
| jae .Laad_loop_4x |
| |
| # Update GHASH with 64 bytes of AAD at a time. |
| cmp \$64, $AADLEN |
| jb .Laad_large_done |
| .Laad_loop_1x: |
| vmovdqu8 ($AAD), $GHASHDATA0 |
| vpshufb $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0 |
| vpxord $GHASHDATA0, $GHASH_ACC, $GHASH_ACC |
| @{[ _ghash_mul $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY, |
| $GHASHDATA0, $GHASHDATA1, $GHASHDATA2 ]} |
| @{[ _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM, |
| $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]} |
| add \$64, $AAD |
| sub \$64, $AADLEN |
| cmp \$64, $AADLEN |
| jae .Laad_loop_1x |
| |
| .Laad_large_done: |
| |
| # GHASH the remaining data 16 bytes at a time, using xmm registers only. |
| .Laad_blockbyblock: |
| test $AADLEN, $AADLEN |
| jz .Laad_done |
| vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1_XMM |
| .Laad_loop_blockbyblock: |
| vmovdqu ($AAD), $GHASHDATA0_XMM |
| vpshufb $BSWAP_MASK_XMM, $GHASHDATA0_XMM, $GHASHDATA0_XMM |
| vpxor $GHASHDATA0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM |
| @{[ _ghash_mul $H_POW1_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM, |
| $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]} |
| add \$16, $AAD |
| sub \$16, $AADLEN |
| jnz .Laad_loop_blockbyblock |
| |
| .Laad_done: |
| # Store the updated GHASH accumulator back to memory. |
| vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM |
| vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR) |
| |
| vzeroupper # This is needed after using ymm or zmm registers. |
| ___ |
| } |
| $code .= _end_func; |
| |
| # Do one non-last round of AES encryption on the counter blocks in aesdata[0-3] |
| # using the round key that has been broadcast to all 128-bit lanes of round_key. |
| sub _vaesenc_4x { |
| my ( $round_key, $aesdata0, $aesdata1, $aesdata2, $aesdata3 ) = @_; |
| return <<___; |
| vaesenc $round_key, $aesdata0, $aesdata0 |
| vaesenc $round_key, $aesdata1, $aesdata1 |
| vaesenc $round_key, $aesdata2, $aesdata2 |
| vaesenc $round_key, $aesdata3, $aesdata3 |
| ___ |
| } |
| |
| # Start the AES encryption of four vectors of counter blocks. |
| sub _ctr_begin_4x { |
| my ( |
| $le_ctr, $le_ctr_inc, $bswap_mask, $rndkey0, |
| $aesdata0, $aesdata1, $aesdata2, $aesdata3 |
| ) = @_; |
| return <<___; |
| # Increment le_ctr four times to generate four vectors of little-endian |
| # counter blocks, swap each to big-endian, and store them in aesdata[0-3]. |
| vpshufb $bswap_mask, $le_ctr, $aesdata0 |
| vpaddd $le_ctr_inc, $le_ctr, $le_ctr |
| vpshufb $bswap_mask, $le_ctr, $aesdata1 |
| vpaddd $le_ctr_inc, $le_ctr, $le_ctr |
| vpshufb $bswap_mask, $le_ctr, $aesdata2 |
| vpaddd $le_ctr_inc, $le_ctr, $le_ctr |
| vpshufb $bswap_mask, $le_ctr, $aesdata3 |
| vpaddd $le_ctr_inc, $le_ctr, $le_ctr |
| |
| # AES "round zero": XOR in the zero-th round key. |
| vpxord $rndkey0, $aesdata0, $aesdata0 |
| vpxord $rndkey0, $aesdata1, $aesdata1 |
| vpxord $rndkey0, $aesdata2, $aesdata2 |
| vpxord $rndkey0, $aesdata3, $aesdata3 |
| ___ |
| } |
| |
| # Do the last AES round for four vectors of counter blocks, XOR four vectors of |
| # source data with the resulting keystream blocks, and write the result to the |
# destination buffer and ghashdata[0-3]. The implementation differs slightly
# from that description: it takes advantage of the property
# vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), which holds because the
# last AES round ends with an AddRoundKey XOR, to reduce latency, but it has
# the same effect.
| sub _aesenclast_and_xor_4x { |
| my ( |
| $src, $dst, $rndkeylast, $aesdata0, |
| $aesdata1, $aesdata2, $aesdata3, $ghashdata0, |
| $ghashdata1, $ghashdata2, $ghashdata3 |
| ) = @_; |
| return <<___; |
| vpxord 0*64($src), $rndkeylast, $ghashdata0 |
| vpxord 1*64($src), $rndkeylast, $ghashdata1 |
| vpxord 2*64($src), $rndkeylast, $ghashdata2 |
| vpxord 3*64($src), $rndkeylast, $ghashdata3 |
| vaesenclast $ghashdata0, $aesdata0, $ghashdata0 |
| vaesenclast $ghashdata1, $aesdata1, $ghashdata1 |
| vaesenclast $ghashdata2, $aesdata2, $ghashdata2 |
| vaesenclast $ghashdata3, $aesdata3, $ghashdata3 |
| vmovdqu8 $ghashdata0, 0*64($dst) |
| vmovdqu8 $ghashdata1, 1*64($dst) |
| vmovdqu8 $ghashdata2, 2*64($dst) |
| vmovdqu8 $ghashdata3, 3*64($dst) |
| ___ |
| } |
| |
| my $g_update_macro_expansion_count = 0; |
| |
| # void aes_gcm_{enc,dec}_update_vaes_avx512(const uint8_t *in, uint8_t *out, |
| # size_t len, const AES_KEY *key, |
| # const uint8_t ivec[16], |
| # const u128 Htable[16], |
| # uint8_t Xi[16]); |
| # |
| # This macro generates a GCM encryption or decryption update function with the |
| # above prototype (with \enc selecting which one). The function computes the |
| # next portion of the CTR keystream, XOR's it with |len| bytes from |in|, and |
| # writes the resulting encrypted or decrypted data to |out|. It also updates |
| # the GHASH accumulator |Xi| using the next |len| ciphertext bytes. |
| # |
| # |len| must be a multiple of 16, except on the last call where it can be any |
| # length. The caller must do any buffering needed to ensure this. Both |
| # in-place and out-of-place en/decryption are supported. |
| # |
| # |ivec| must give the current counter in big-endian format. This function |
| # loads the counter from |ivec| and increments the loaded counter as needed, but |
| # it does *not* store the updated counter back to |ivec|. The caller must |
| # update |ivec| if any more data segments follow. Internally, only the low |
| # 32-bit word of the counter is incremented, following the GCM standard. |
| sub _aes_gcm_update { |
| my $local_label_suffix = "__func" . ++$g_update_macro_expansion_count; |
| my ($enc) = @_; |
| my $code = ""; |
| |
| # Function arguments |
| my ( $SRC, $DST, $DATALEN, $AESKEY, $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR ) |
| = $win64 |
| ? ( @argregs[ 0 .. 3 ], "%rsi", "%rdi", "%r12" ) |
| : ( @argregs[ 0 .. 5 ], "%r12" ); |
| |
| # Additional local variables. |
| # %rax, %k1, and %k2 are used as temporary registers. BE_CTR_PTR is |
| # also available as a temporary register after the counter is loaded. |
| |
| # AES key length in bytes |
| my ( $AESKEYLEN, $AESKEYLEN64 ) = ( "%r10d", "%r10" ); |
| |
| # Pointer to the last AES round key for the chosen AES variant |
| my $RNDKEYLAST_PTR = "%r11"; |
| |
| # AESDATA[0-3] hold the counter blocks that are being encrypted by AES. |
| my ( $AESDATA0, $AESDATA0_XMM ) = ( "%zmm0", "%xmm0" ); |
| my ( $AESDATA1, $AESDATA1_XMM ) = ( "%zmm1", "%xmm1" ); |
| my ( $AESDATA2, $AESDATA2_XMM ) = ( "%zmm2", "%xmm2" ); |
| my ( $AESDATA3, $AESDATA3_XMM ) = ( "%zmm3", "%xmm3" ); |
| my @AESDATA = ( $AESDATA0, $AESDATA1, $AESDATA2, $AESDATA3 ); |
| |
| # GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data. |
| my ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( "%zmm4", "%xmm4" ); |
| my ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( "%zmm5", "%xmm5" ); |
| my ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( "%zmm6", "%xmm6" ); |
| my ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( "%zmm7", "%xmm7" ); |
| my @GHASHDATA = ( $GHASHDATA0, $GHASHDATA1, $GHASHDATA2, $GHASHDATA3 ); |
| my @GHASHDATA_XMM = |
| ( $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM, $GHASHDATA3_XMM ); |
| |
| # BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values |
| # using vpshufb, copied to all 128-bit lanes. |
| my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%zmm8", "%xmm8" ); |
| |
| # RNDKEY temporarily holds the next AES round key. |
| my $RNDKEY = "%zmm9"; |
| |
| # GHASH_ACC is the accumulator variable for GHASH. When fully reduced, |
| # only the lowest 128-bit lane can be nonzero. When not fully reduced, |
| # more than one lane may be used, and they need to be XOR'd together. |
| my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%zmm10", "%xmm10" ); |
| |
| # LE_CTR_INC is the vector of 32-bit words that need to be added to a |
| # vector of little-endian counter blocks to advance it forwards. |
| my $LE_CTR_INC = "%zmm11"; |
| |
| # LE_CTR contains the next set of little-endian counter blocks. |
| my $LE_CTR = "%zmm12"; |
| |
| # RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys, |
| # copied to all 128-bit lanes. RNDKEY0 is the zero-th round key, |
| # RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last. |
| my ( |
| $RNDKEY0, $RNDKEYLAST, $RNDKEY_M9, $RNDKEY_M8, |
| $RNDKEY_M7, $RNDKEY_M6, $RNDKEY_M5, $RNDKEY_M4, |
| $RNDKEY_M3, $RNDKEY_M2, $RNDKEY_M1 |
| ) |
| = ( |
| "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", |
| "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23" |
| ); |
| |
| # GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These |
| # cannot coincide with anything used for AES encryption, since for |
| # performance reasons GHASH and AES encryption are interleaved. |
| my ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) = |
| ( "%zmm24", "%zmm25", "%zmm26" ); |
| |
| # H_POW[4-1] contain the powers of the hash key H^16...H^1. The descending |
| # numbering reflects the order of the key powers. |
| my ( $H_POW4, $H_POW3, $H_POW2, $H_POW1 ) = |
| ( "%zmm27", "%zmm28", "%zmm29", "%zmm30" ); |
| |
| # GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes. |
| my $GFPOLY = "%zmm31"; |
| |
| my @ghash_4x_args = ( |
| $BSWAP_MASK, @GHASHDATA, @GHASHDATA_XMM, $H_POW4, |
| $H_POW3, $H_POW2, $H_POW1, $GFPOLY, |
| $GHASHTMP0, $GHASHTMP1, $GHASHTMP2, $GHASH_ACC, |
| $GHASH_ACC_XMM |
| ); |
| |
| if ($win64) { |
| $code .= <<___; |
| @{[ _save_gpregs $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR ]} |
| mov 64(%rsp), $BE_CTR_PTR # arg5 |
| mov 72(%rsp), $HTABLE # arg6 |
| mov 80(%rsp), $GHASH_ACC_PTR # arg7 |
| @{[ _save_xmmregs (6 .. 15) ]} |
| .seh_endprologue |
| ___ |
| } |
| else { |
| $code .= <<___; |
| @{[ _save_gpregs $GHASH_ACC_PTR ]} |
| mov 16(%rsp), $GHASH_ACC_PTR # arg7 |
| ___ |
| } |
| |
| if ($enc) { |
| $code .= <<___; |
| #ifdef BORINGSSL_DISPATCH_TEST |
| .extern BORINGSSL_function_hit |
| movb \$1,BORINGSSL_function_hit+7(%rip) |
| #endif |
| ___ |
| } |
| $code .= <<___; |
| # Load some constants. |
| vbroadcasti32x4 .Lbswap_mask(%rip), $BSWAP_MASK |
| vbroadcasti32x4 .Lgfpoly(%rip), $GFPOLY |
| |
| # Load the GHASH accumulator and the starting counter. |
| # BoringSSL passes these values in big endian format. |
| vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM |
| vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM |
| vbroadcasti32x4 ($BE_CTR_PTR), $LE_CTR |
| vpshufb $BSWAP_MASK, $LE_CTR, $LE_CTR |
| |
| # Load the AES key length in bytes. BoringSSL stores number of rounds |
| # minus 1, so convert using: AESKEYLEN = 4 * aeskey->rounds - 20. |
| movl $OFFSETOF_AES_ROUNDS($AESKEY), $AESKEYLEN |
| lea -20(,$AESKEYLEN,4), $AESKEYLEN |
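# For example, for AES-128 (10 rounds) the rounds field holds 9, so this
# computes 4*9 - 20 = 16 bytes.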
| |
| # Make RNDKEYLAST_PTR point to the last AES round key. This is the |
| # round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256 |
| # respectively. Then load the zero-th and last round keys. |
| lea 6*16($AESKEY,$AESKEYLEN64,4), $RNDKEYLAST_PTR |
| vbroadcasti32x4 ($AESKEY), $RNDKEY0 |
| vbroadcasti32x4 ($RNDKEYLAST_PTR), $RNDKEYLAST |
| |
| # Finish initializing LE_CTR by adding [0, 1, 2, 3] to its low words. |
| vpaddd .Lctr_pattern(%rip), $LE_CTR, $LE_CTR |
| |
| # Load 4 into all 128-bit lanes of LE_CTR_INC. |
| vbroadcasti32x4 .Linc_4blocks(%rip), $LE_CTR_INC |
| |
| # If there are at least 256 bytes of data, then continue into the loop |
| # that processes 256 bytes of data at a time. Otherwise skip it. |
| cmp \$256, $DATALEN |
| jb .Lcrypt_loop_4x_done$local_label_suffix |
| |
| # Load powers of the hash key. |
| vmovdqu8 $OFFSETOFEND_H_POWERS-4*64($HTABLE), $H_POW4 |
| vmovdqu8 $OFFSETOFEND_H_POWERS-3*64($HTABLE), $H_POW3 |
| vmovdqu8 $OFFSETOFEND_H_POWERS-2*64($HTABLE), $H_POW2 |
| vmovdqu8 $OFFSETOFEND_H_POWERS-1*64($HTABLE), $H_POW1 |
| ___ |
| |
| # Main loop: en/decrypt and hash 4 vectors at a time. |
| # |
| # When possible, interleave the AES encryption of the counter blocks |
| # with the GHASH update of the ciphertext blocks. This improves |
| # performance on many CPUs because the execution ports used by the VAES |
| # instructions often differ from those used by vpclmulqdq and other |
| # instructions used in GHASH. For example, many Intel CPUs dispatch |
| # vaesenc to ports 0 and 1 and vpclmulqdq to port 5. |
| # |
| # The interleaving is easiest to do during decryption, since during |
| # decryption the ciphertext blocks are immediately available. For |
| # encryption, instead encrypt the first set of blocks, then hash those |
| # blocks while encrypting the next set of blocks, repeat that as |
| # needed, and finally hash the last set of blocks. |
| |
| if ($enc) { |
| $code .= <<___; |
| # Encrypt the first 4 vectors of plaintext blocks. Leave the resulting |
| # ciphertext in GHASHDATA[0-3] for GHASH. |
| @{[ _ctr_begin_4x $LE_CTR, $LE_CTR_INC, $BSWAP_MASK, $RNDKEY0, @AESDATA ]} |
| lea 16($AESKEY), %rax |
| .Lvaesenc_loop_first_4_vecs$local_label_suffix: |
| vbroadcasti32x4 (%rax), $RNDKEY |
| @{[ _vaesenc_4x $RNDKEY, @AESDATA ]} |
| add \$16, %rax |
| cmp %rax, $RNDKEYLAST_PTR |
| jne .Lvaesenc_loop_first_4_vecs$local_label_suffix |
| @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA, @GHASHDATA ]} |
| add \$256, $SRC |
| add \$256, $DST |
| sub \$256, $DATALEN |
| cmp \$256, $DATALEN |
| jb .Lghash_last_ciphertext_4x$local_label_suffix |
| ___ |
| } |
| |
| $code .= <<___; |
| # Cache as many additional AES round keys as possible. |
| vbroadcasti32x4 -9*16($RNDKEYLAST_PTR), $RNDKEY_M9 |
| vbroadcasti32x4 -8*16($RNDKEYLAST_PTR), $RNDKEY_M8 |
| vbroadcasti32x4 -7*16($RNDKEYLAST_PTR), $RNDKEY_M7 |
| vbroadcasti32x4 -6*16($RNDKEYLAST_PTR), $RNDKEY_M6 |
| vbroadcasti32x4 -5*16($RNDKEYLAST_PTR), $RNDKEY_M5 |
| vbroadcasti32x4 -4*16($RNDKEYLAST_PTR), $RNDKEY_M4 |
| vbroadcasti32x4 -3*16($RNDKEYLAST_PTR), $RNDKEY_M3 |
| vbroadcasti32x4 -2*16($RNDKEYLAST_PTR), $RNDKEY_M2 |
| vbroadcasti32x4 -1*16($RNDKEYLAST_PTR), $RNDKEY_M1 |
| |
| .Lcrypt_loop_4x$local_label_suffix: |
| ___ |
| |
| # If decrypting, load more ciphertext blocks into GHASHDATA[0-3]. If |
| # encrypting, GHASHDATA[0-3] already contain the previous ciphertext. |
| if ( !$enc ) { |
| $code .= <<___; |
| vmovdqu8 0*64($SRC), $GHASHDATA0 |
| vmovdqu8 1*64($SRC), $GHASHDATA1 |
| vmovdqu8 2*64($SRC), $GHASHDATA2 |
| vmovdqu8 3*64($SRC), $GHASHDATA3 |
| ___ |
| } |
| |
| $code .= <<___; |
| # Start the AES encryption of the counter blocks. |
| @{[ _ctr_begin_4x $LE_CTR, $LE_CTR_INC, $BSWAP_MASK, $RNDKEY0, @AESDATA ]} |
| cmp \$24, $AESKEYLEN |
| jl .Laes128$local_label_suffix |
| je .Laes192$local_label_suffix |
| # AES-256 |
| vbroadcasti32x4 -13*16($RNDKEYLAST_PTR), $RNDKEY |
| @{[ _vaesenc_4x $RNDKEY, @AESDATA ]} |
| vbroadcasti32x4 -12*16($RNDKEYLAST_PTR), $RNDKEY |
| @{[ _vaesenc_4x $RNDKEY, @AESDATA ]} |
| .Laes192$local_label_suffix: |
| vbroadcasti32x4 -11*16($RNDKEYLAST_PTR), $RNDKEY |
| @{[ _vaesenc_4x $RNDKEY, @AESDATA ]} |
| vbroadcasti32x4 -10*16($RNDKEYLAST_PTR), $RNDKEY |
| @{[ _vaesenc_4x $RNDKEY, @AESDATA ]} |
| .Laes128$local_label_suffix: |
| |
| # Prefetch the source data 512 bytes ahead into the L1 data cache, to |
| # improve performance when the hardware prefetcher is disabled. Assumes the |
| # L1 data cache line size is 64 bytes (de facto standard on x86_64). |
| prefetcht0 512+0*64($SRC) |
| prefetcht0 512+1*64($SRC) |
| prefetcht0 512+2*64($SRC) |
| prefetcht0 512+3*64($SRC) |
| |
| # Finish the AES encryption of the counter blocks in AESDATA[0-3], |
| # interleaved with the GHASH update of the ciphertext blocks in |
| # GHASHDATA[0-3]. |
| @{[ _ghash_step_4x 0, @ghash_4x_args ]} |
| @{[ _vaesenc_4x $RNDKEY_M9, @AESDATA ]} |
| @{[ _ghash_step_4x 1, @ghash_4x_args ]} |
| @{[ _vaesenc_4x $RNDKEY_M8, @AESDATA ]} |
| @{[ _ghash_step_4x 2, @ghash_4x_args ]} |
| @{[ _vaesenc_4x $RNDKEY_M7, @AESDATA ]} |
| @{[ _ghash_step_4x 3, @ghash_4x_args ]} |
| @{[ _vaesenc_4x $RNDKEY_M6, @AESDATA ]} |
| @{[ _ghash_step_4x 4, @ghash_4x_args ]} |
| @{[ _vaesenc_4x $RNDKEY_M5, @AESDATA ]} |
| @{[ _ghash_step_4x 5, @ghash_4x_args ]} |
| @{[ _vaesenc_4x $RNDKEY_M4, @AESDATA ]} |
| @{[ _ghash_step_4x 6, @ghash_4x_args ]} |
| @{[ _vaesenc_4x $RNDKEY_M3, @AESDATA ]} |
| @{[ _ghash_step_4x 7, @ghash_4x_args ]} |
| @{[ _vaesenc_4x $RNDKEY_M2, @AESDATA ]} |
| @{[ _ghash_step_4x 8, @ghash_4x_args ]} |
| @{[ _vaesenc_4x $RNDKEY_M1, @AESDATA ]} |
| |
| @{[ _ghash_step_4x 9, @ghash_4x_args ]} |
| @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA, @GHASHDATA ]} |
| add \$256, $SRC |
| add \$256, $DST |
| sub \$256, $DATALEN |
| cmp \$256, $DATALEN |
| jae .Lcrypt_loop_4x$local_label_suffix |
| ___ |
| |
| if ($enc) { |
| |
| # Update GHASH with the last set of ciphertext blocks. |
| $code .= <<___; |
| .Lghash_last_ciphertext_4x$local_label_suffix: |
| @{[ _ghash_4x @ghash_4x_args ]} |
| ___ |
| } |
| |
| my $POWERS_PTR = $BE_CTR_PTR; # BE_CTR_PTR is free to be reused. |
| |
| $code .= <<___; |
| .Lcrypt_loop_4x_done$local_label_suffix: |
| # Check whether any data remains. |
| test $DATALEN, $DATALEN |
| jz .Ldone$local_label_suffix |
| |
| # The data length isn't a multiple of 256 bytes. Process the remaining |
| # data of length 1 <= DATALEN < 256, up to one 64-byte vector at a time. |
| # Going one vector at a time may seem inefficient compared to having |
| # separate code paths for each possible number of vectors remaining. |
# However, using a loop keeps the code size down, and it performs
# surprisingly well in practice; modern CPUs will start executing the next
# iteration before the previous one finishes and can also predict the number
# of loop iterations. For a similar reason, we roll up the AES rounds.
| # |
| # On the last iteration, the remaining length may be less than 64 bytes. |
| # Handle this using masking. |
| # |
| # Since there are enough key powers available for all remaining data, |
| # there is no need to do a GHASH reduction after each iteration. |
| # Instead, multiply each remaining block by its own key power, and only |
| # do a GHASH reduction at the very end. |
| |
| # Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N |
| # is the number of blocks that remain. |
| mov $DATALEN, %rax |
| neg %rax |
| and \$-16, %rax # -round_up(DATALEN, 16) |
| lea $OFFSETOFEND_H_POWERS($HTABLE,%rax), $POWERS_PTR |
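# (For example, if 65 to 80 bytes remain, five blocks remain, so POWERS_PTR
# points at H^5 and the remaining blocks get multiplied by H^5 through H^1.)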
| ___ |
| |
| # Start collecting the unreduced GHASH intermediate value LO, MI, HI. |
| my ( $LO, $LO_XMM ) = ( $GHASHDATA0, $GHASHDATA0_XMM ); |
| my ( $MI, $MI_XMM ) = ( $GHASHDATA1, $GHASHDATA1_XMM ); |
| my ( $HI, $HI_XMM ) = ( $GHASHDATA2, $GHASHDATA2_XMM ); |
| $code .= <<___; |
| vpxor $LO_XMM, $LO_XMM, $LO_XMM |
| vpxor $MI_XMM, $MI_XMM, $MI_XMM |
| vpxor $HI_XMM, $HI_XMM, $HI_XMM |
| |
| cmp \$64, $DATALEN |
| jb .Lpartial_vec$local_label_suffix |
| |
| .Lcrypt_loop_1x$local_label_suffix: |
| # Process a full 64-byte vector. |
| |
| # Encrypt a vector of counter blocks. |
| vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA0 |
| vpaddd $LE_CTR_INC, $LE_CTR, $LE_CTR |
| vpxord $RNDKEY0, $AESDATA0, $AESDATA0 |
| lea 16($AESKEY), %rax |
| .Lvaesenc_loop_tail_full_vec$local_label_suffix: |
| vbroadcasti32x4 (%rax), $RNDKEY |
| vaesenc $RNDKEY, $AESDATA0, $AESDATA0 |
| add \$16, %rax |
| cmp %rax, $RNDKEYLAST_PTR |
| jne .Lvaesenc_loop_tail_full_vec$local_label_suffix |
| vaesenclast $RNDKEYLAST, $AESDATA0, $AESDATA0 |
| |
| # XOR the data with the vector of keystream blocks. |
| vmovdqu8 ($SRC), $AESDATA1 |
| vpxord $AESDATA1, $AESDATA0, $AESDATA0 |
| vmovdqu8 $AESDATA0, ($DST) |
| |
| # Update GHASH with the ciphertext blocks, without reducing. |
| vmovdqu8 ($POWERS_PTR), $H_POW1 |
| vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $AESDATA1 ]}, $AESDATA0 |
| vpxord $GHASH_ACC, $AESDATA0, $AESDATA0 |
| @{[ _ghash_mul_noreduce $H_POW1, $AESDATA0, $LO, $MI, $HI, |
| $GHASHDATA3, $AESDATA1, $AESDATA2, $AESDATA3 ]} |
| vpxor $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM |
| |
| add \$64, $POWERS_PTR |
| add \$64, $SRC |
| add \$64, $DST |
| sub \$64, $DATALEN |
| cmp \$64, $DATALEN |
| jae .Lcrypt_loop_1x$local_label_suffix |
| |
| test $DATALEN, $DATALEN |
| jz .Lreduce$local_label_suffix |
| |
| .Lpartial_vec$local_label_suffix: |
| # Process a partial vector of length 1 <= DATALEN < 64. |
| |
| # Set the data mask %k1 to DATALEN 1's. |
| # Set the key powers mask %k2 to round_up(DATALEN, 16) 1's. |
| mov \$-1, %rax |
| bzhi $DATALEN, %rax, %rax |
| kmovq %rax, %k1 |
| add \$15, $DATALEN |
| and \$-16, $DATALEN |
| mov \$-1, %rax |
| bzhi $DATALEN, %rax, %rax |
| kmovq %rax, %k2 |
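# (For example, if DATALEN = 20, then %k1 gets 20 one bits and, after DATALEN
# is rounded up to 32, %k2 gets 32 one bits; the masked loads below then read
# 20 data bytes and the two key powers H^2 and H^1.)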
| |
| # Encrypt one last vector of counter blocks. This does not need to be |
| # masked. The counter does not need to be incremented here. |
| vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA0 |
| vpxord $RNDKEY0, $AESDATA0, $AESDATA0 |
| lea 16($AESKEY), %rax |
| .Lvaesenc_loop_tail_partialvec$local_label_suffix: |
| vbroadcasti32x4 (%rax), $RNDKEY |
| vaesenc $RNDKEY, $AESDATA0, $AESDATA0 |
| add \$16, %rax |
| cmp %rax, $RNDKEYLAST_PTR |
| jne .Lvaesenc_loop_tail_partialvec$local_label_suffix |
| vaesenclast $RNDKEYLAST, $AESDATA0, $AESDATA0 |
| |
| # XOR the data with the appropriate number of keystream bytes. |
| vmovdqu8 ($SRC), $AESDATA1\{%k1}{z} |
| vpxord $AESDATA1, $AESDATA0, $AESDATA0 |
| vmovdqu8 $AESDATA0, ($DST){%k1} |
| |
| # Update GHASH with the ciphertext block(s), without reducing. |
| # |
| # In the case of DATALEN < 64, the ciphertext is zero-padded to 64 |
| # bytes. (If decrypting, it's done by the above masked load. If |
| # encrypting, it's done by the below masked register-to-register move.) |
| # Note that if DATALEN <= 48, there will be additional padding beyond |
| # the padding of the last block specified by GHASH itself; i.e., there |
| # may be whole block(s) that get processed by the GHASH multiplication |
| # and reduction instructions but should not actually be included in the |
| # GHASH. However, any such blocks are all-zeroes, and the values that |
| # they're multiplied with are also all-zeroes. Therefore they just add |
| # 0 * 0 = 0 to the final GHASH result, which makes no difference. |
| vmovdqu8 ($POWERS_PTR), $H_POW1\{%k2}{z} |
| @{[ $enc ? "vmovdqu8 $AESDATA0, $AESDATA1\{%k1}{z}" : "" ]} |
| vpshufb $BSWAP_MASK, $AESDATA1, $AESDATA0 |
| vpxord $GHASH_ACC, $AESDATA0, $AESDATA0 |
| @{[ _ghash_mul_noreduce $H_POW1, $AESDATA0, $LO, $MI, $HI, |
| $GHASHDATA3, $AESDATA1, $AESDATA2, $AESDATA3 ]} |
| |
| .Lreduce$local_label_suffix: |
| # Finally, do the GHASH reduction. |
| @{[ _ghash_reduce $LO, $MI, $HI, $GFPOLY, $AESDATA0 ]} |
| @{[ _horizontal_xor $HI, $HI_XMM, $GHASH_ACC_XMM, |
| $AESDATA0_XMM, $AESDATA1_XMM, $AESDATA2_XMM ]} |
| |
| .Ldone$local_label_suffix: |
| # Store the updated GHASH accumulator back to memory. |
| vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM |
| vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR) |
| |
| vzeroupper # This is needed after using ymm or zmm registers. |
| ___ |
| return $code; |
| } |
| |
| $code .= _begin_func "aes_gcm_enc_update_vaes_avx512", 1; |
| $code .= _aes_gcm_update 1; |
| $code .= _end_func; |
| |
| $code .= _begin_func "aes_gcm_dec_update_vaes_avx512", 1; |
| $code .= _aes_gcm_update 0; |
| $code .= _end_func; |
| |
| print $code; |
| close STDOUT or die "error closing STDOUT: $!"; |
| exit 0; |