Clean up aes-gcm-avx512-x86_64.pl to assume 512-bit vectors

aes-gcm-avx512-x86_64.pl (originally aes-gcm-avx10-x86_64.pl) was
designed to support multiple maximum vector lengths, while still
utilizing AVX512 / AVX10 features such as the increased number of vector
registers.  However, the support for multiple maximum vector lengths
turned out not to be useful.  Support for maximum vector lengths other
than 512 bits was recently removed from the AVX10 specification, which
leaves "avoiding downclocking" as the only remaining reason to limit
AVX512 / AVX10 code to 256-bit vectors.  But the severe 512-bit
downclocking has gone away in newer CPUs, and the separate VAES+AVX2
code that I ended up having to write anyway (for CPUs that support VAES
but not AVX512) already provides nearly as good 256-bit support.

Therefore, clean up aes-gcm-avx512-x86_64.pl so that it is no longer
written in terms of a generic vector length but instead simply assumes
512-bit vectors.

This results in some minor changes to the generated assembly:

- The labels in gcm_init_vpclmulqdq_avx512 and
  gcm_ghash_vpclmulqdq_avx512 no longer have the suffixes that were used
  to differentiate between VL=32 and VL=64.
- gcm_init_vpclmulqdq_avx512 is now in a slightly different place in the
  file, since (like the AVX2 equivalent) it's now generated at the
  top level instead of via a Perl function that gets called later on.
- The inc_2blocks label (only used for VL=32) has been removed.
- The code no longer goes out of its way to avoid using immediates of
  4*VL, which is now always 256.  This was an optimization for VL=32
  that shortened some instructions by 3 bytes by keeping immediates in
  the range [-128, 127].  With VL=64 this optimization is not possible,
  so we might as well just write the "obvious" code instead, as
  illustrated below.
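
  For reference, an illustrative sketch of the dropped encoding trick
  (AT&T syntax; not taken verbatim from the generated code): an
  immediate in [-128, 127] can use the sign-extended imm8 opcode form
  (0x83), while anything larger needs the imm32 form (0x81):

      sub     $-128, %rdi     # REX.W + 83 /5 ib  -> 4 bytes
      add     $128, %rdi      # REX.W + 81 /0 id  -> 7 bytes

  Both advance %rdi by 128, but the sub-with-negative-immediate form is
  3 bytes shorter.  With VL=64 the step is 256, which does not fit in an
  imm8 either way, so the code now just writes 'add $256, ...'.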

Change-Id: I44027d4a81f7d9bdfd4c27e410de2d0158b10325
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/77848
Reviewed-by: David Benjamin <davidben@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl b/crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl
index 99c2c27..4b98b77 100644
--- a/crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl
@@ -191,16 +191,14 @@
 .Lgfpoly_and_internal_carrybit:
     .quad   1, 0xc200000000000001
 
-    # The below constants are used for incrementing the counter blocks.
-    # ctr_pattern points to the four 128-bit values [0, 1, 2, 3].
-    # inc_2blocks and inc_4blocks point to the single 128-bit values 2 and
-    # 4.  Note that the same '2' is reused in ctr_pattern and inc_2blocks.
+    # Values needed to prepare the initial vector of counter blocks.
 .Lctr_pattern:
     .quad   0, 0
     .quad   1, 0
-.Linc_2blocks:
     .quad   2, 0
     .quad   3, 0
+
+    # The number of AES blocks per vector, as a 128-bit value.
 .Linc_4blocks:
     .quad   4, 0
 
@@ -216,63 +214,6 @@
 # Offset to 'rounds' in AES_KEY struct
 my $OFFSETOF_AES_ROUNDS = 240;
 
-# The current vector length in bytes
-my $VL;
-
-my (
-    $V0,  $V1,  $V2,  $V3,  $V4,  $V5,  $V6,  $V7,  $V8,  $V9,  $V10,
-    $V11, $V12, $V13, $V14, $V15, $V16, $V17, $V18, $V19, $V20, $V21,
-    $V22, $V23, $V24, $V25, $V26, $V27, $V28, $V29, $V30, $V31
-);
-
-# Set the vector length in bytes.  This sets the VL variable and defines
-# register aliases V0-V31 that map to the ymm or zmm registers.
-sub _set_veclen {
-    ($VL) = @_;
-    my $prefix;
-    if ( $VL == 32 ) {
-        $prefix = "%ymm";
-    }
-    elsif ( $VL == 64 ) {
-        $prefix = "%zmm";
-    }
-    else {
-        die "Unsupported vector length";
-    }
-    $V0  = "${prefix}0";
-    $V1  = "${prefix}1";
-    $V2  = "${prefix}2";
-    $V3  = "${prefix}3";
-    $V4  = "${prefix}4";
-    $V5  = "${prefix}5";
-    $V6  = "${prefix}6";
-    $V7  = "${prefix}7";
-    $V8  = "${prefix}8";
-    $V9  = "${prefix}9";
-    $V10 = "${prefix}10";
-    $V11 = "${prefix}11";
-    $V12 = "${prefix}12";
-    $V13 = "${prefix}13";
-    $V14 = "${prefix}14";
-    $V15 = "${prefix}15";
-    $V16 = "${prefix}16";
-    $V17 = "${prefix}17";
-    $V18 = "${prefix}18";
-    $V19 = "${prefix}19";
-    $V20 = "${prefix}20";
-    $V21 = "${prefix}21";
-    $V22 = "${prefix}22";
-    $V23 = "${prefix}23";
-    $V24 = "${prefix}24";
-    $V25 = "${prefix}25";
-    $V26 = "${prefix}26";
-    $V27 = "${prefix}27";
-    $V28 = "${prefix}28";
-    $V29 = "${prefix}29";
-    $V30 = "${prefix}30";
-    $V31 = "${prefix}31";
-}
-
 # The _ghash_mul macro multiplies the 128-bit lanes of \a by the corresponding
 # 128-bit lanes of \b and stores the reduced products in \dst.  \t0, \t1, and
 # \t2 are temporary registers of the same size as \a and \b.
@@ -418,37 +359,29 @@
 ___
 }
 
-my $g_init_macro_expansion_count = 0;
-
-# void gcm_init_##suffix(u128 Htable[16], const uint64_t H[2]);
+# void gcm_init_vpclmulqdq_avx512(u128 Htable[16], const uint64_t H[2]);
 #
 # Initialize |Htable| with powers of the GHASH subkey |H|.
 #
 # The powers are stored in the order H^NUM_H_POWERS to H^1.
-#
-# This macro supports both VL=32 and VL=64.  _set_veclen must have been invoked
-# with the desired length.  In the VL=32 case, the function computes twice as
-# many key powers than are actually used by the VL=32 GCM update functions.
-# This is done to keep the key format the same regardless of vector length.
-sub _aes_gcm_init {
-    my $local_label_suffix = "__func" . ++$g_init_macro_expansion_count;
-
+$code .= _begin_func "gcm_init_vpclmulqdq_avx512", 0;
+{
     # Function arguments
     my ( $HTABLE, $H_PTR ) = @argregs[ 0 .. 1 ];
 
     # Additional local variables.  %rax is used as a temporary register.
-    my ( $TMP0, $TMP0_YMM, $TMP0_XMM ) = ( $V0, "%ymm0", "%xmm0" );
-    my ( $TMP1, $TMP1_YMM, $TMP1_XMM ) = ( $V1, "%ymm1", "%xmm1" );
-    my ( $TMP2, $TMP2_YMM, $TMP2_XMM ) = ( $V2, "%ymm2", "%xmm2" );
+    my ( $TMP0, $TMP0_YMM, $TMP0_XMM ) = ( "%zmm0", "%ymm0", "%xmm0" );
+    my ( $TMP1, $TMP1_YMM, $TMP1_XMM ) = ( "%zmm1", "%ymm1", "%xmm1" );
+    my ( $TMP2, $TMP2_YMM, $TMP2_XMM ) = ( "%zmm2", "%ymm2", "%xmm2" );
     my $POWERS_PTR     = "%r8";
     my $RNDKEYLAST_PTR = "%r9";
-    my ( $H_CUR, $H_CUR_YMM, $H_CUR_XMM )    = ( "$V3", "%ymm3", "%xmm3" );
-    my ( $H_INC, $H_INC_YMM, $H_INC_XMM )    = ( "$V4", "%ymm4", "%xmm4" );
-    my ( $GFPOLY, $GFPOLY_YMM, $GFPOLY_XMM ) = ( "$V5", "%ymm5", "%xmm5" );
+    my ( $H_CUR, $H_CUR_YMM, $H_CUR_XMM )    = ( "%zmm3", "%ymm3", "%xmm3" );
+    my ( $H_INC, $H_INC_YMM, $H_INC_XMM )    = ( "%zmm4", "%ymm4", "%xmm4" );
+    my ( $GFPOLY, $GFPOLY_YMM, $GFPOLY_XMM ) = ( "%zmm5", "%ymm5", "%xmm5" );
 
-    my $code = <<___;
+    $code .= <<___;
     # Get pointer to lowest set of key powers (located at end of array).
-    lea             $OFFSETOFEND_H_POWERS-$VL($HTABLE), $POWERS_PTR
+    lea             $OFFSETOFEND_H_POWERS-64($HTABLE), $POWERS_PTR
 
     # Load the byte-reflected hash subkey.  BoringSSL provides it in
     # byte-reflected form except the two halves are in the wrong order.
@@ -491,62 +424,43 @@
     # Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2].
     vinserti128     \$1, $H_CUR_XMM, $H_INC_YMM, $H_CUR_YMM
     vinserti128     \$1, $H_INC_XMM, $H_INC_YMM, $H_INC_YMM
-___
 
-    if ( $VL == 64 ) {
+    # Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4].
+    @{[ _ghash_mul  $H_INC_YMM, $H_CUR_YMM, $H_INC_YMM, $GFPOLY_YMM,
+                    $TMP0_YMM, $TMP1_YMM, $TMP2_YMM ]}
+    vinserti64x4    \$1, $H_CUR_YMM, $H_INC, $H_CUR
+    vshufi64x2      \$0, $H_INC, $H_INC, $H_INC
 
-        # Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4].
-        $code .= <<___;
-        @{[ _ghash_mul  $H_INC_YMM, $H_CUR_YMM, $H_INC_YMM, $GFPOLY_YMM,
-                        $TMP0_YMM, $TMP1_YMM, $TMP2_YMM ]}
-        vinserti64x4    \$1, $H_CUR_YMM, $H_INC, $H_CUR
-        vshufi64x2      \$0, $H_INC, $H_INC, $H_INC
-___
-    }
-
-    $code .= <<___;
     # Store the lowest set of key powers.
     vmovdqu8        $H_CUR, ($POWERS_PTR)
 
-    # Compute and store the remaining key powers.  With VL=32, repeatedly
-    # multiply [H^(i+1), H^i] by [H^2, H^2] to get [H^(i+3), H^(i+2)].
-    # With VL=64, repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by
+    # Compute and store the remaining key powers.
+    # Repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by
     # [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)].
-    mov             \$@{[ $NUM_H_POWERS*16/$VL - 1 ]}, %eax
-.Lprecompute_next$local_label_suffix:
-    sub             \$$VL, $POWERS_PTR
+    mov             \$3, %eax
+.Lprecompute_next:
+    sub             \$64, $POWERS_PTR
     @{[ _ghash_mul  $H_INC, $H_CUR, $H_CUR, $GFPOLY, $TMP0, $TMP1, $TMP2 ]}
     vmovdqu8        $H_CUR, ($POWERS_PTR)
     dec             %eax
-    jnz             .Lprecompute_next$local_label_suffix
+    jnz             .Lprecompute_next
 
     vzeroupper      # This is needed after using ymm or zmm registers.
 ___
-    return $code;
 }
+$code .= _end_func;
 
 # XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store
 # the result in \dst_xmm.  This implicitly zeroizes the other lanes of dst.
 sub _horizontal_xor {
     my ( $src, $src_xmm, $dst_xmm, $t0_xmm, $t1_xmm, $t2_xmm ) = @_;
-    if ( $VL == 32 ) {
-        return <<___;
-        vextracti32x4   \$1, $src, $t0_xmm
-        vpxord          $t0_xmm, $src_xmm, $dst_xmm
+    return <<___;
+    vextracti32x4   \$1, $src, $t0_xmm
+    vextracti32x4   \$2, $src, $t1_xmm
+    vextracti32x4   \$3, $src, $t2_xmm
+    vpxord          $t0_xmm, $src_xmm, $dst_xmm
+    vpternlogd      \$0x96, $t1_xmm, $t2_xmm, $dst_xmm
 ___
-    }
-    elsif ( $VL == 64 ) {
-        return <<___;
-        vextracti32x4   \$1, $src, $t0_xmm
-        vextracti32x4   \$2, $src, $t1_xmm
-        vextracti32x4   \$3, $src, $t2_xmm
-        vpxord          $t0_xmm, $src_xmm, $dst_xmm
-        vpternlogd      \$0x96, $t1_xmm, $t2_xmm, $dst_xmm
-___
-    }
-    else {
-        die "Unsupported vector length";
-    }
 }
 
 # Do one step of the GHASH update of the data blocks given in the vector
@@ -560,25 +474,21 @@
 #
 # The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) +
 # H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the
-# operations are vectorized operations on vectors of 16-byte blocks.  E.g.,
-# with VL=32 there are 2 blocks per vector and the vectorized terms correspond
-# to the following non-vectorized terms:
+# operations are vectorized operations on 512-bit vectors of 128-bit blocks.
+# The vectorized terms correspond to the following non-vectorized terms:
 #
-#       H_POW4*(GHASHDATA0 + GHASH_ACC) => H^8*(blk0 + GHASH_ACC_XMM) and H^7*(blk1 + 0)
-#       H_POW3*GHASHDATA1 => H^6*blk2 and H^5*blk3
-#       H_POW2*GHASHDATA2 => H^4*blk4 and H^3*blk5
-#       H_POW1*GHASHDATA3 => H^2*blk6 and H^1*blk7
-#
-# With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15.
+#       H_POW4*(GHASHDATA0 + GHASH_ACC) => H^16*(blk0 + GHASH_ACC_XMM),
+#              H^15*(blk1 + 0), H^14*(blk2 + 0), and H^13*(blk3 + 0)
+#       H_POW3*GHASHDATA1 => H^12*blk4, H^11*blk5, H^10*blk6, and H^9*blk7
+#       H_POW2*GHASHDATA2 => H^8*blk8,  H^7*blk9,  H^6*blk10, and H^5*blk11
+#       H_POW1*GHASHDATA3 => H^4*blk12, H^3*blk13, H^2*blk14, and H^1*blk15
 #
 # More concretely, this code does:
 #   - Do vectorized "schoolbook" multiplications to compute the intermediate
 #     256-bit product of each block and its corresponding hash key power.
-#     There are 4*VL/16 of these intermediate products.
-#   - Sum (XOR) the intermediate 256-bit products across vectors.  This leaves
-#     VL/16 256-bit intermediate values.
-#   - Do a vectorized reduction of these 256-bit intermediate values to
-#     128-bits each.  This leaves VL/16 128-bit intermediate values.
+#   - Sum (XOR) the intermediate 256-bit products across vectors.
+#   - Do a vectorized reduction of these 256-bit intermediate values to 128-bits
+#     each.
 #   - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM.
 #
 # See _ghash_mul for the full explanation of the operations performed for each
@@ -705,42 +615,35 @@
 }
 $code .= _end_func;
 
-my $g_ghash_macro_expansion_count = 0;
-
-# void gcm_ghash_##suffix(uint8_t Xi[16], const u128 Htable[16],
-#                         const uint8_t *in, size_t len);
+# void gcm_ghash_vpclmulqdq_avx512(uint8_t Xi[16], const u128 Htable[16],
+#                                  const uint8_t *in, size_t len);
 #
-# This macro generates the body of a GHASH update function with the above
-# prototype.  This macro supports both VL=32 and VL=64.  _set_veclen must have
-# been invoked with the desired length.
-#
-# This function uses the key |Htable| to update the GHASH accumulator |Xi| with
-# the data given by |in| and |len|.  |len| must be a multiple of 16.
+# Using the key |Htable|, update the GHASH accumulator |Xi| with the data given
+# by |in| and |len|.  |len| must be a multiple of 16.
 #
 # This function handles large amounts of AAD efficiently, while also keeping the
 # overhead low for small amounts of AAD which is the common case.  TLS uses less
 # than one block of AAD, but (uncommonly) other use cases may use much more.
-sub _ghash_update {
-    my $local_label_suffix = "__func" . ++$g_ghash_macro_expansion_count;
-    my $code               = "";
-
+$code .= _begin_func "gcm_ghash_vpclmulqdq_avx512", 1;
+{
     # Function arguments
     my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ];
 
     # Additional local variables
-    my ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( $V0, "%xmm0" );
-    my ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( $V1, "%xmm1" );
-    my ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( $V2, "%xmm2" );
-    my ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( $V3, "%xmm3" );
+    my ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( "%zmm0", "%xmm0" );
+    my ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( "%zmm1", "%xmm1" );
+    my ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( "%zmm2", "%xmm2" );
+    my ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( "%zmm3", "%xmm3" );
     my @GHASHDATA = ( $GHASHDATA0, $GHASHDATA1, $GHASHDATA2, $GHASHDATA3 );
     my @GHASHDATA_XMM =
       ( $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM, $GHASHDATA3_XMM );
-    my ( $BSWAP_MASK, $BSWAP_MASK_XMM )       = ( $V4, "%xmm4" );
-    my ( $GHASH_ACC, $GHASH_ACC_XMM )         = ( $V5, "%xmm5" );
-    my ( $H_POW4, $H_POW3, $H_POW2 )          = ( $V6, $V7, $V8 );
-    my ( $H_POW1, $H_POW1_XMM )               = ( $V9, "%xmm9" );
-    my ( $GFPOLY, $GFPOLY_XMM )               = ( $V10, "%xmm10" );
-    my ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) = ( $V11, $V12, $V13 );
+    my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%zmm4", "%xmm4" );
+    my ( $GHASH_ACC, $GHASH_ACC_XMM )   = ( "%zmm5", "%xmm5" );
+    my ( $H_POW4, $H_POW3, $H_POW2 )    = ( "%zmm6", "%zmm7", "%zmm8" );
+    my ( $H_POW1, $H_POW1_XMM )         = ( "%zmm9", "%xmm9" );
+    my ( $GFPOLY, $GFPOLY_XMM )         = ( "%zmm10", "%xmm10" );
+    my ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) =
+      ( "%zmm11", "%zmm12", "%zmm13" );
 
     $code .= <<___;
     @{[ _save_xmmregs (6 .. 13) ]}
@@ -756,44 +659,44 @@
     vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC_XMM
     vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
 
-    # Optimize for AADLEN < VL by checking for AADLEN < VL before AADLEN < 4*VL.
-    cmp             \$$VL, $AADLEN
-    jb              .Laad_blockbyblock$local_label_suffix
+    # Optimize for AADLEN < 64 by checking for AADLEN < 64 before AADLEN < 256.
+    cmp             \$64, $AADLEN
+    jb              .Laad_blockbyblock
 
-    # AADLEN >= VL, so we'll operate on full vectors.  Broadcast bswap_mask and
+    # AADLEN >= 64, so we'll operate on full vectors.  Broadcast bswap_mask and
     # gfpoly to all 128-bit lanes.
     vshufi64x2      \$0, $BSWAP_MASK, $BSWAP_MASK, $BSWAP_MASK
     vshufi64x2      \$0, $GFPOLY, $GFPOLY, $GFPOLY
 
     # Load the lowest set of key powers.
-    vmovdqu8        $OFFSETOFEND_H_POWERS-1*$VL($HTABLE), $H_POW1
+    vmovdqu8        $OFFSETOFEND_H_POWERS-1*64($HTABLE), $H_POW1
 
-    cmp             \$4*$VL-1, $AADLEN
-    jbe             .Laad_loop_1x$local_label_suffix
+    cmp             \$256, $AADLEN
+    jb              .Laad_loop_1x
 
-    # AADLEN >= 4*VL.  Load the higher key powers.
-    vmovdqu8        $OFFSETOFEND_H_POWERS-4*$VL($HTABLE), $H_POW4
-    vmovdqu8        $OFFSETOFEND_H_POWERS-3*$VL($HTABLE), $H_POW3
-    vmovdqu8        $OFFSETOFEND_H_POWERS-2*$VL($HTABLE), $H_POW2
+    # AADLEN >= 256.  Load the higher key powers.
+    vmovdqu8        $OFFSETOFEND_H_POWERS-4*64($HTABLE), $H_POW4
+    vmovdqu8        $OFFSETOFEND_H_POWERS-3*64($HTABLE), $H_POW3
+    vmovdqu8        $OFFSETOFEND_H_POWERS-2*64($HTABLE), $H_POW2
 
-    # Update GHASH with 4*VL bytes of AAD at a time.
-.Laad_loop_4x$local_label_suffix:
-    vmovdqu8        0*$VL($AAD), $GHASHDATA0
-    vmovdqu8        1*$VL($AAD), $GHASHDATA1
-    vmovdqu8        2*$VL($AAD), $GHASHDATA2
-    vmovdqu8        3*$VL($AAD), $GHASHDATA3
+    # Update GHASH with 256 bytes of AAD at a time.
+.Laad_loop_4x:
+    vmovdqu8        0*64($AAD), $GHASHDATA0
+    vmovdqu8        1*64($AAD), $GHASHDATA1
+    vmovdqu8        2*64($AAD), $GHASHDATA2
+    vmovdqu8        3*64($AAD), $GHASHDATA3
     @{[ _ghash_4x   $BSWAP_MASK, @GHASHDATA, @GHASHDATA_XMM, $H_POW4, $H_POW3,
                     $H_POW2, $H_POW1, $GFPOLY, $GHASHTMP0, $GHASHTMP1,
                     $GHASHTMP2, $GHASH_ACC, $GHASH_ACC_XMM ]}
-    sub             \$-4*$VL, $AAD  # shorter than 'add 4*VL' when VL=32
-    add             \$-4*$VL, $AADLEN
-    cmp             \$4*$VL-1, $AADLEN
-    ja              .Laad_loop_4x$local_label_suffix
+    add             \$256, $AAD
+    sub             \$256, $AADLEN
+    cmp             \$256, $AADLEN
+    jae             .Laad_loop_4x
 
-    # Update GHASH with VL bytes of AAD at a time.
-    cmp             \$$VL, $AADLEN
-    jb              .Laad_large_done$local_label_suffix
-.Laad_loop_1x$local_label_suffix:
+    # Update GHASH with 64 bytes of AAD at a time.
+    cmp             \$64, $AADLEN
+    jb              .Laad_large_done
+.Laad_loop_1x:
     vmovdqu8        ($AAD), $GHASHDATA0
     vpshufb         $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0
     vpxord          $GHASHDATA0, $GHASH_ACC, $GHASH_ACC
@@ -801,19 +704,19 @@
                     $GHASHDATA0, $GHASHDATA1, $GHASHDATA2 ]}
     @{[ _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM,
                         $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
-    add             \$$VL, $AAD
-    sub             \$$VL, $AADLEN
-    cmp             \$$VL, $AADLEN
-    jae             .Laad_loop_1x$local_label_suffix
+    add             \$64, $AAD
+    sub             \$64, $AADLEN
+    cmp             \$64, $AADLEN
+    jae             .Laad_loop_1x
 
-.Laad_large_done$local_label_suffix:
+.Laad_large_done:
 
     # GHASH the remaining data 16 bytes at a time, using xmm registers only.
-.Laad_blockbyblock$local_label_suffix:
+.Laad_blockbyblock:
     test            $AADLEN, $AADLEN
-    jz              .Laad_done$local_label_suffix
+    jz              .Laad_done
     vmovdqu         $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1_XMM
-.Laad_loop_blockbyblock$local_label_suffix:
+.Laad_loop_blockbyblock:
     vmovdqu         ($AAD), $GHASHDATA0_XMM
     vpshufb         $BSWAP_MASK_XMM, $GHASHDATA0_XMM, $GHASHDATA0_XMM
     vpxor           $GHASHDATA0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
@@ -821,17 +724,17 @@
                     $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
     add             \$16, $AAD
     sub             \$16, $AADLEN
-    jnz             .Laad_loop_blockbyblock$local_label_suffix
+    jnz             .Laad_loop_blockbyblock
 
-.Laad_done$local_label_suffix:
+.Laad_done:
     # Store the updated GHASH accumulator back to memory.
     vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
     vmovdqu         $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
 
     vzeroupper      # This is needed after using ymm or zmm registers.
 ___
-    return $code;
 }
+$code .= _end_func;
 
 # Do one non-last round of AES encryption on the counter blocks in aesdata[0-3]
 # using the round key that has been broadcast to all 128-bit lanes of round_key.
@@ -883,37 +786,34 @@
         $ghashdata1, $ghashdata2, $ghashdata3
     ) = @_;
     return <<___;
-    vpxord          0*$VL($src), $rndkeylast, $ghashdata0
-    vpxord          1*$VL($src), $rndkeylast, $ghashdata1
-    vpxord          2*$VL($src), $rndkeylast, $ghashdata2
-    vpxord          3*$VL($src), $rndkeylast, $ghashdata3
+    vpxord          0*64($src), $rndkeylast, $ghashdata0
+    vpxord          1*64($src), $rndkeylast, $ghashdata1
+    vpxord          2*64($src), $rndkeylast, $ghashdata2
+    vpxord          3*64($src), $rndkeylast, $ghashdata3
     vaesenclast     $ghashdata0, $aesdata0, $ghashdata0
     vaesenclast     $ghashdata1, $aesdata1, $ghashdata1
     vaesenclast     $ghashdata2, $aesdata2, $ghashdata2
     vaesenclast     $ghashdata3, $aesdata3, $ghashdata3
-    vmovdqu8        $ghashdata0, 0*$VL($dst)
-    vmovdqu8        $ghashdata1, 1*$VL($dst)
-    vmovdqu8        $ghashdata2, 2*$VL($dst)
-    vmovdqu8        $ghashdata3, 3*$VL($dst)
+    vmovdqu8        $ghashdata0, 0*64($dst)
+    vmovdqu8        $ghashdata1, 1*64($dst)
+    vmovdqu8        $ghashdata2, 2*64($dst)
+    vmovdqu8        $ghashdata3, 3*64($dst)
 ___
 }
 
 my $g_update_macro_expansion_count = 0;
 
-# void aes_gcm_{enc,dec}_update_##suffix(const uint8_t *in, uint8_t *out,
-#                                        size_t len, const AES_KEY *key,
-#                                        const uint8_t ivec[16],
-#                                        const u128 Htable[16],
-#                                        uint8_t Xi[16]);
+# void aes_gcm_{enc,dec}_update_vaes_avx512(const uint8_t *in, uint8_t *out,
+#                                           size_t len, const AES_KEY *key,
+#                                           const uint8_t ivec[16],
+#                                           const u128 Htable[16],
+#                                           uint8_t Xi[16]);
 #
 # This macro generates a GCM encryption or decryption update function with the
-# above prototype (with \enc selecting which one).  This macro supports both
-# VL=32 and VL=64.  _set_veclen must have been invoked with the desired length.
-#
-# This function computes the next portion of the CTR keystream, XOR's it with
-# |len| bytes from |in|, and writes the resulting encrypted or decrypted data
-# to |out|.  It also updates the GHASH accumulator |Xi| using the next |len|
-# ciphertext bytes.
+# above prototype (with \enc selecting which one).  The function computes the
+# next portion of the CTR keystream, XOR's it with |len| bytes from |in|, and
+# writes the resulting encrypted or decrypted data to |out|.  It also updates
+# the GHASH accumulator |Xi| using the next |len| ciphertext bytes.
 #
 # |len| must be a multiple of 16, except on the last call where it can be any
 # length.  The caller must do any buffering needed to ensure this.  Both
@@ -946,39 +846,39 @@
     my $RNDKEYLAST_PTR = "%r11";
 
     # AESDATA[0-3] hold the counter blocks that are being encrypted by AES.
-    my ( $AESDATA0, $AESDATA0_XMM ) = ( $V0, "%xmm0" );
-    my ( $AESDATA1, $AESDATA1_XMM ) = ( $V1, "%xmm1" );
-    my ( $AESDATA2, $AESDATA2_XMM ) = ( $V2, "%xmm2" );
-    my ( $AESDATA3, $AESDATA3_XMM ) = ( $V3, "%xmm3" );
+    my ( $AESDATA0, $AESDATA0_XMM ) = ( "%zmm0", "%xmm0" );
+    my ( $AESDATA1, $AESDATA1_XMM ) = ( "%zmm1", "%xmm1" );
+    my ( $AESDATA2, $AESDATA2_XMM ) = ( "%zmm2", "%xmm2" );
+    my ( $AESDATA3, $AESDATA3_XMM ) = ( "%zmm3", "%xmm3" );
     my @AESDATA = ( $AESDATA0, $AESDATA1, $AESDATA2, $AESDATA3 );
 
     # GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data.
-    my ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( $V4, "%xmm4" );
-    my ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( $V5, "%xmm5" );
-    my ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( $V6, "%xmm6" );
-    my ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( $V7, "%xmm7" );
+    my ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( "%zmm4", "%xmm4" );
+    my ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( "%zmm5", "%xmm5" );
+    my ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( "%zmm6", "%xmm6" );
+    my ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( "%zmm7", "%xmm7" );
     my @GHASHDATA = ( $GHASHDATA0, $GHASHDATA1, $GHASHDATA2, $GHASHDATA3 );
     my @GHASHDATA_XMM =
       ( $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM, $GHASHDATA3_XMM );
 
     # BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
     # using vpshufb, copied to all 128-bit lanes.
-    my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( $V8, "%xmm8" );
+    my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%zmm8", "%xmm8" );
 
     # RNDKEY temporarily holds the next AES round key.
-    my $RNDKEY = $V9;
+    my $RNDKEY = "%zmm9";
 
     # GHASH_ACC is the accumulator variable for GHASH.  When fully reduced,
     # only the lowest 128-bit lane can be nonzero.  When not fully reduced,
     # more than one lane may be used, and they need to be XOR'd together.
-    my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( $V10, "%xmm10" );
+    my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%zmm10", "%xmm10" );
 
     # LE_CTR_INC is the vector of 32-bit words that need to be added to a
     # vector of little-endian counter blocks to advance it forwards.
-    my $LE_CTR_INC = $V11;
+    my $LE_CTR_INC = "%zmm11";
 
     # LE_CTR contains the next set of little-endian counter blocks.
-    my $LE_CTR = $V12;
+    my $LE_CTR = "%zmm12";
 
     # RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys,
     # copied to all 128-bit lanes.  RNDKEY0 is the zero-th round key,
@@ -987,19 +887,25 @@
         $RNDKEY0,   $RNDKEYLAST, $RNDKEY_M9, $RNDKEY_M8,
         $RNDKEY_M7, $RNDKEY_M6,  $RNDKEY_M5, $RNDKEY_M4,
         $RNDKEY_M3, $RNDKEY_M2,  $RNDKEY_M1
-    ) = ( $V13, $V14, $V15, $V16, $V17, $V18, $V19, $V20, $V21, $V22, $V23 );
+      )
+      = (
+        "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18",
+        "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23"
+      );
 
     # GHASHTMP[0-2] are temporary variables used by _ghash_step_4x.  These
     # cannot coincide with anything used for AES encryption, since for
     # performance reasons GHASH and AES encryption are interleaved.
-    my ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) = ( $V24, $V25, $V26 );
+    my ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) =
+      ( "%zmm24", "%zmm25", "%zmm26" );
 
-    # H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1.  The
-    # descending numbering reflects the order of the key powers.
-    my ( $H_POW4, $H_POW3, $H_POW2, $H_POW1 ) = ( $V27, $V28, $V29, $V30 );
+    # H_POW[4-1] contain the powers of the hash key H^16...H^1.  The descending
+    # numbering reflects the order of the key powers.
+    my ( $H_POW4, $H_POW3, $H_POW2, $H_POW1 ) =
+      ( "%zmm27", "%zmm28", "%zmm29", "%zmm30" );
 
     # GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes.
-    my $GFPOLY = $V31;
+    my $GFPOLY = "%zmm31";
 
     my @ghash_4x_args = (
         $BSWAP_MASK, @GHASHDATA, @GHASHDATA_XMM, $H_POW4,
@@ -1029,7 +935,7 @@
         $code .= <<___;
 #ifdef BORINGSSL_DISPATCH_TEST
         .extern BORINGSSL_function_hit
-        movb \$1,BORINGSSL_function_hit+@{[ $VL < 64 ? 6 : 7 ]}(%rip)
+        movb \$1,BORINGSSL_function_hit+7(%rip)
 #endif
 ___
     }
@@ -1057,22 +963,22 @@
     vbroadcasti32x4 ($AESKEY), $RNDKEY0
     vbroadcasti32x4 ($RNDKEYLAST_PTR), $RNDKEYLAST
 
-    # Finish initializing LE_CTR by adding [0, 1, ...] to its low words.
+    # Finish initializing LE_CTR by adding [0, 1, 2, 3] to its low words.
     vpaddd          .Lctr_pattern(%rip), $LE_CTR, $LE_CTR
 
-    # Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes.
-    vbroadcasti32x4 .Linc_@{[ $VL / 16 ]}blocks(%rip), $LE_CTR_INC
+    # Load 4 into all 128-bit lanes of LE_CTR_INC.
+    vbroadcasti32x4 .Linc_4blocks(%rip), $LE_CTR_INC
 
-    # If there are at least 4*VL bytes of data, then continue into the loop
-    # that processes 4*VL bytes of data at a time.  Otherwise skip it.
-    cmp             \$4*$VL-1, $DATALEN
-    jbe             .Lcrypt_loop_4x_done$local_label_suffix
+    # If there are at least 256 bytes of data, then continue into the loop
+    # that processes 256 bytes of data at a time.  Otherwise skip it.
+    cmp             \$256, $DATALEN
+    jb              .Lcrypt_loop_4x_done$local_label_suffix
 
     # Load powers of the hash key.
-    vmovdqu8        $OFFSETOFEND_H_POWERS-4*$VL($HTABLE), $H_POW4
-    vmovdqu8        $OFFSETOFEND_H_POWERS-3*$VL($HTABLE), $H_POW3
-    vmovdqu8        $OFFSETOFEND_H_POWERS-2*$VL($HTABLE), $H_POW2
-    vmovdqu8        $OFFSETOFEND_H_POWERS-1*$VL($HTABLE), $H_POW1
+    vmovdqu8        $OFFSETOFEND_H_POWERS-4*64($HTABLE), $H_POW4
+    vmovdqu8        $OFFSETOFEND_H_POWERS-3*64($HTABLE), $H_POW3
+    vmovdqu8        $OFFSETOFEND_H_POWERS-2*64($HTABLE), $H_POW2
+    vmovdqu8        $OFFSETOFEND_H_POWERS-1*64($HTABLE), $H_POW1
 ___
 
     # Main loop: en/decrypt and hash 4 vectors at a time.
@@ -1103,11 +1009,11 @@
         cmp             %rax, $RNDKEYLAST_PTR
         jne             .Lvaesenc_loop_first_4_vecs$local_label_suffix
         @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA, @GHASHDATA ]}
-        sub             \$-4*$VL, $SRC  # shorter than 'add 4*VL' when VL=32
-        sub             \$-4*$VL, $DST
-        add             \$-4*$VL, $DATALEN
-        cmp             \$4*$VL-1, $DATALEN
-        jbe             .Lghash_last_ciphertext_4x$local_label_suffix
+        add             \$256, $SRC
+        add             \$256, $DST
+        sub             \$256, $DATALEN
+        cmp             \$256, $DATALEN
+        jb              .Lghash_last_ciphertext_4x$local_label_suffix
 ___
     }
 
@@ -1130,10 +1036,10 @@
     # encrypting, GHASHDATA[0-3] already contain the previous ciphertext.
     if ( !$enc ) {
         $code .= <<___;
-        vmovdqu8        0*$VL($SRC), $GHASHDATA0
-        vmovdqu8        1*$VL($SRC), $GHASHDATA1
-        vmovdqu8        2*$VL($SRC), $GHASHDATA2
-        vmovdqu8        3*$VL($SRC), $GHASHDATA3
+        vmovdqu8        0*64($SRC), $GHASHDATA0
+        vmovdqu8        1*64($SRC), $GHASHDATA1
+        vmovdqu8        2*64($SRC), $GHASHDATA2
+        vmovdqu8        3*64($SRC), $GHASHDATA3
 ___
     }
 
@@ -1154,16 +1060,15 @@
     vbroadcasti32x4 -10*16($RNDKEYLAST_PTR), $RNDKEY
     @{[ _vaesenc_4x $RNDKEY, @AESDATA ]}
 .Laes128$local_label_suffix:
-___
 
     # Prefetch the source data 512 bytes ahead into the L1 data cache, to
     # improve performance when the hardware prefetcher is disabled.  Assumes the
     # L1 data cache line size is 64 bytes (de facto standard on x86_64).
-    for ( my $i = 0 ; $i < 4 * $VL ; $i += 64 ) {
-        $code .= "prefetcht0  512+$i($SRC)\n";
-    }
+    prefetcht0      512+0*64($SRC)
+    prefetcht0      512+1*64($SRC)
+    prefetcht0      512+2*64($SRC)
+    prefetcht0      512+3*64($SRC)
 
-    $code .= <<___;
     # Finish the AES encryption of the counter blocks in AESDATA[0-3],
     # interleaved with the GHASH update of the ciphertext blocks in
     # GHASHDATA[0-3].
@@ -1188,11 +1093,11 @@
 
     @{[ _ghash_step_4x  9, @ghash_4x_args ]}
     @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA, @GHASHDATA ]}
-    sub             \$-4*$VL, $SRC  # shorter than 'add 4*VL' when VL=32
-    sub             \$-4*$VL, $DST
-    add             \$-4*$VL, $DATALEN
-    cmp             \$4*$VL-1, $DATALEN
-    ja              .Lcrypt_loop_4x$local_label_suffix
+    add             \$256, $SRC
+    add             \$256, $DST
+    sub             \$256, $DATALEN
+    cmp             \$256, $DATALEN
+    jae             .Lcrypt_loop_4x$local_label_suffix
 ___
 
     if ($enc) {
@@ -1212,8 +1117,8 @@
     test            $DATALEN, $DATALEN
     jz              .Ldone$local_label_suffix
 
-    # The data length isn't a multiple of 4*VL.  Process the remaining data
-    # of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time.
+    # The data length isn't a multiple of 256 bytes.  Process the remaining
+    # data of length 1 <= DATALEN < 256, up to one 64-byte vector at a time.
     # Going one vector at a time may seem inefficient compared to having
     # separate code paths for each possible number of vectors remaining.
     # However, using a loop keeps the code size down, and it performs
@@ -1221,7 +1126,7 @@
     # before the previous one finishes and also predict the number of loop
     # iterations.  For a similar reason, we roll up the AES rounds.
     #
-    # On the last iteration, the remaining length may be less than VL.
+    # On the last iteration, the remaining length may be less than 64 bytes.
     # Handle this using masking.
     #
     # Since there are enough key powers available for all remaining data,
@@ -1246,11 +1151,11 @@
     vpxor           $MI_XMM, $MI_XMM, $MI_XMM
     vpxor           $HI_XMM, $HI_XMM, $HI_XMM
 
-    cmp             \$$VL, $DATALEN
+    cmp             \$64, $DATALEN
     jb              .Lpartial_vec$local_label_suffix
 
 .Lcrypt_loop_1x$local_label_suffix:
-    # Process a full vector of length VL.
+    # Process a full 64-byte vector.
 
     # Encrypt a vector of counter blocks.
     vpshufb         $BSWAP_MASK, $LE_CTR, $AESDATA0
@@ -1278,29 +1183,29 @@
                             $GHASHDATA3, $AESDATA1, $AESDATA2, $AESDATA3 ]}
     vpxor           $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
 
-    add             \$$VL, $POWERS_PTR
-    add             \$$VL, $SRC
-    add             \$$VL, $DST
-    sub             \$$VL, $DATALEN
-    cmp             \$$VL, $DATALEN
+    add             \$64, $POWERS_PTR
+    add             \$64, $SRC
+    add             \$64, $DST
+    sub             \$64, $DATALEN
+    cmp             \$64, $DATALEN
     jae             .Lcrypt_loop_1x$local_label_suffix
 
     test            $DATALEN, $DATALEN
     jz              .Lreduce$local_label_suffix
 
 .Lpartial_vec$local_label_suffix:
-    # Process a partial vector of length 1 <= DATALEN < VL.
+    # Process a partial vector of length 1 <= DATALEN < 64.
 
     # Set the data mask %k1 to DATALEN 1's.
     # Set the key powers mask %k2 to round_up(DATALEN, 16) 1's.
     mov             \$-1, %rax
     bzhi            $DATALEN, %rax, %rax
-    @{[ $VL < 64 ? "kmovd %eax, %k1" : "kmovq %rax, %k1" ]}
+    kmovq           %rax, %k1
     add             \$15, $DATALEN
     and             \$-16, $DATALEN
     mov             \$-1, %rax
     bzhi            $DATALEN, %rax, %rax
-    @{[ $VL < 64 ? "kmovd %eax, %k2" : "kmovq %rax, %k2" ]}
+    kmovq           %rax, %k2
 
     # Encrypt one last vector of counter blocks.  This does not need to be
     # masked.  The counter does not need to be incremented here.
@@ -1322,13 +1227,13 @@
 
     # Update GHASH with the ciphertext block(s), without reducing.
     #
-    # In the case of DATALEN < VL, the ciphertext is zero-padded to VL.
-    # (If decrypting, it's done by the above masked load.  If encrypting,
-    # it's done by the below masked register-to-register move.)  Note that
-    # if DATALEN <= VL - 16, there will be additional padding beyond the
-    # padding of the last block specified by GHASH itself; i.e., there may
-    # be whole block(s) that get processed by the GHASH multiplication and
-    # reduction instructions but should not actually be included in the
+    # In the case of DATALEN < 64, the ciphertext is zero-padded to 64
+    # bytes.  (If decrypting, it's done by the above masked load.  If
+    # encrypting, it's done by the below masked register-to-register move.)
+    # Note that if DATALEN <= 48, there will be additional padding beyond
+    # the padding of the last block specified by GHASH itself; i.e., there
+    # may be whole block(s) that get processed by the GHASH multiplication
+    # and reduction instructions but should not actually be included in the
     # GHASH.  However, any such blocks are all-zeroes, and the values that
     # they're multiplied with are also all-zeroes.  Therefore they just add
     # 0 * 0 = 0 to the final GHASH result, which makes no difference.
@@ -1355,35 +1260,6 @@
     return $code;
 }
 
-# Disabled until significant deployment of AVX10/256 is seen.  The separate
-# *_vaes_avx2 implementation provides the only 256-bit support for now.
-#
-# $code .= _begin_func "gcm_init_vpclmulqdq_avx10_256", 0;
-# $code .= _aes_gcm_init;
-# $code .= _end_func;
-#
-# $code .= _begin_func "gcm_ghash_vpclmulqdq_avx10_256", 1;
-# $code .= _ghash_update;
-# $code .= _end_func;
-#
-# $code .= _begin_func "aes_gcm_enc_update_vaes_avx10_256", 1;
-# $code .= _aes_gcm_update 1;
-# $code .= _end_func;
-#
-# $code .= _begin_func "aes_gcm_dec_update_vaes_avx10_256", 1;
-# $code .= _aes_gcm_update 0;
-# $code .= _end_func;
-
-_set_veclen 64;
-
-$code .= _begin_func "gcm_init_vpclmulqdq_avx512", 0;
-$code .= _aes_gcm_init;
-$code .= _end_func;
-
-$code .= _begin_func "gcm_ghash_vpclmulqdq_avx512", 1;
-$code .= _ghash_update;
-$code .= _end_func;
-
 $code .= _begin_func "aes_gcm_enc_update_vaes_avx512", 1;
 $code .= _aes_gcm_update 1;
 $code .= _end_func;
diff --git a/gen/bcm/aes-gcm-avx512-x86_64-apple.S b/gen/bcm/aes-gcm-avx512-x86_64-apple.S
index 157feb5..2ab2442 100644
--- a/gen/bcm/aes-gcm-avx512-x86_64-apple.S
+++ b/gen/bcm/aes-gcm-avx512-x86_64-apple.S
@@ -26,57 +26,17 @@
 .quad	1, 0xc200000000000001
 
 
-
-
-
 L$ctr_pattern:
 .quad	0, 0
 .quad	1, 0
-L$inc_2blocks:
 .quad	2, 0
 .quad	3, 0
+
+
 L$inc_4blocks:
 .quad	4, 0
 
 .text	
-.globl	_gcm_gmult_vpclmulqdq_avx512
-.private_extern _gcm_gmult_vpclmulqdq_avx512
-
-.p2align	5
-_gcm_gmult_vpclmulqdq_avx512:
-
-
-_CET_ENDBR
-
-
-
-	vmovdqu	(%rdi),%xmm0
-	vmovdqu	L$bswap_mask(%rip),%xmm1
-	vmovdqu	256-16(%rsi),%xmm2
-	vmovdqu	L$gfpoly(%rip),%xmm3
-	vpshufb	%xmm1,%xmm0,%xmm0
-
-	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm4
-	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm5
-	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm6
-	vpxord	%xmm6,%xmm5,%xmm5
-	vpclmulqdq	$0x01,%xmm4,%xmm3,%xmm6
-	vpshufd	$0x4e,%xmm4,%xmm4
-	vpternlogd	$0x96,%xmm6,%xmm4,%xmm5
-	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm0
-	vpclmulqdq	$0x01,%xmm5,%xmm3,%xmm4
-	vpshufd	$0x4e,%xmm5,%xmm5
-	vpternlogd	$0x96,%xmm4,%xmm5,%xmm0
-
-
-	vpshufb	%xmm1,%xmm0,%xmm0
-	vmovdqu	%xmm0,(%rdi)
-
-
-	ret
-
-
-
 .globl	_gcm_init_vpclmulqdq_avx512
 .private_extern _gcm_init_vpclmulqdq_avx512
 
@@ -139,6 +99,8 @@
 
 	vinserti128	$1,%xmm3,%ymm4,%ymm3
 	vinserti128	$1,%xmm4,%ymm4,%ymm4
+
+
 	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm0
 	vpclmulqdq	$0x01,%ymm4,%ymm3,%ymm1
 	vpclmulqdq	$0x10,%ymm4,%ymm3,%ymm2
@@ -154,14 +116,14 @@
 	vinserti64x4	$1,%ymm3,%zmm4,%zmm3
 	vshufi64x2	$0,%zmm4,%zmm4,%zmm4
 
+
 	vmovdqu8	%zmm3,(%r8)
 
 
 
 
-
 	movl	$3,%eax
-L$precompute_next__func1:
+L$precompute_next:
 	subq	$64,%r8
 	vpclmulqdq	$0x00,%zmm4,%zmm3,%zmm0
 	vpclmulqdq	$0x01,%zmm4,%zmm3,%zmm1
@@ -177,13 +139,51 @@
 
 	vmovdqu8	%zmm3,(%r8)
 	decl	%eax
-	jnz	L$precompute_next__func1
+	jnz	L$precompute_next
 
 	vzeroupper
 	ret
 
 
 
+.globl	_gcm_gmult_vpclmulqdq_avx512
+.private_extern _gcm_gmult_vpclmulqdq_avx512
+
+.p2align	5
+_gcm_gmult_vpclmulqdq_avx512:
+
+
+_CET_ENDBR
+
+
+
+	vmovdqu	(%rdi),%xmm0
+	vmovdqu	L$bswap_mask(%rip),%xmm1
+	vmovdqu	256-16(%rsi),%xmm2
+	vmovdqu	L$gfpoly(%rip),%xmm3
+	vpshufb	%xmm1,%xmm0,%xmm0
+
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm4
+	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm5
+	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm6
+	vpxord	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x01,%xmm4,%xmm3,%xmm6
+	vpshufd	$0x4e,%xmm4,%xmm4
+	vpternlogd	$0x96,%xmm6,%xmm4,%xmm5
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm0
+	vpclmulqdq	$0x01,%xmm5,%xmm3,%xmm4
+	vpshufd	$0x4e,%xmm5,%xmm5
+	vpternlogd	$0x96,%xmm4,%xmm5,%xmm0
+
+
+	vpshufb	%xmm1,%xmm0,%xmm0
+	vmovdqu	%xmm0,(%rdi)
+
+
+	ret
+
+
+
 .globl	_gcm_ghash_vpclmulqdq_avx512
 .private_extern _gcm_ghash_vpclmulqdq_avx512
 
@@ -207,7 +207,7 @@
 
 
 	cmpq	$64,%rcx
-	jb	L$aad_blockbyblock__func1
+	jb	L$aad_blockbyblock
 
 
 
@@ -217,8 +217,8 @@
 
 	vmovdqu8	256-64(%rsi),%zmm9
 
-	cmpq	$256-1,%rcx
-	jbe	L$aad_loop_1x__func1
+	cmpq	$256,%rcx
+	jb	L$aad_loop_1x
 
 
 	vmovdqu8	256-256(%rsi),%zmm6
@@ -226,7 +226,7 @@
 	vmovdqu8	256-128(%rsi),%zmm8
 
 
-L$aad_loop_4x__func1:
+L$aad_loop_4x:
 	vmovdqu8	0(%rdx),%zmm0
 	vmovdqu8	64(%rdx),%zmm1
 	vmovdqu8	128(%rdx),%zmm2
@@ -272,15 +272,15 @@
 	vpxord	%xmm0,%xmm5,%xmm5
 	vpternlogd	$0x96,%xmm1,%xmm2,%xmm5
 
-	subq	$-256,%rdx
-	addq	$-256,%rcx
-	cmpq	$256-1,%rcx
-	ja	L$aad_loop_4x__func1
+	addq	$256,%rdx
+	subq	$256,%rcx
+	cmpq	$256,%rcx
+	jae	L$aad_loop_4x
 
 
 	cmpq	$64,%rcx
-	jb	L$aad_large_done__func1
-L$aad_loop_1x__func1:
+	jb	L$aad_large_done
+L$aad_loop_1x:
 	vmovdqu8	(%rdx),%zmm0
 	vpshufb	%zmm4,%zmm0,%zmm0
 	vpxord	%zmm0,%zmm5,%zmm5
@@ -305,16 +305,16 @@
 	addq	$64,%rdx
 	subq	$64,%rcx
 	cmpq	$64,%rcx
-	jae	L$aad_loop_1x__func1
+	jae	L$aad_loop_1x
 
-L$aad_large_done__func1:
+L$aad_large_done:
 
 
-L$aad_blockbyblock__func1:
+L$aad_blockbyblock:
 	testq	%rcx,%rcx
-	jz	L$aad_done__func1
+	jz	L$aad_done
 	vmovdqu	256-16(%rsi),%xmm9
-L$aad_loop_blockbyblock__func1:
+L$aad_loop_blockbyblock:
 	vmovdqu	(%rdx),%xmm0
 	vpshufb	%xmm4,%xmm0,%xmm0
 	vpxor	%xmm0,%xmm5,%xmm5
@@ -332,9 +332,9 @@
 
 	addq	$16,%rdx
 	subq	$16,%rcx
-	jnz	L$aad_loop_blockbyblock__func1
+	jnz	L$aad_loop_blockbyblock
 
-L$aad_done__func1:
+L$aad_done:
 
 	vpshufb	%xmm4,%xmm5,%xmm5
 	vmovdqu	%xmm5,(%rdi)
@@ -391,8 +391,8 @@
 
 
 
-	cmpq	$256-1,%rdx
-	jbe	L$crypt_loop_4x_done__func1
+	cmpq	$256,%rdx
+	jb	L$crypt_loop_4x_done__func1
 
 
 	vmovdqu8	256-256(%r9),%zmm27
@@ -442,11 +442,11 @@
 	vmovdqu8	%zmm6,128(%rsi)
 	vmovdqu8	%zmm7,192(%rsi)
 
-	subq	$-256,%rdi
-	subq	$-256,%rsi
-	addq	$-256,%rdx
-	cmpq	$256-1,%rdx
-	jbe	L$ghash_last_ciphertext_4x__func1
+	addq	$256,%rdi
+	addq	$256,%rsi
+	subq	$256,%rdx
+	cmpq	$256,%rdx
+	jb	L$ghash_last_ciphertext_4x__func1
 
 	vbroadcasti32x4	-144(%r11),%zmm15
 	vbroadcasti32x4	-128(%r11),%zmm16
@@ -507,6 +507,10 @@
 	vaesenc	%zmm9,%zmm3,%zmm3
 
 L$aes128__func1:
+
+
+
+
 	prefetcht0	512+0(%rdi)
 	prefetcht0	512+64(%rdi)
 	prefetcht0	512+128(%rdi)
@@ -514,6 +518,7 @@
 
 
 
+
 	vpshufb	%zmm8,%zmm4,%zmm4
 	vpxord	%zmm10,%zmm4,%zmm4
 	vpshufb	%zmm8,%zmm5,%zmm5
@@ -623,11 +628,11 @@
 	vmovdqu8	%zmm6,128(%rsi)
 	vmovdqu8	%zmm7,192(%rsi)
 
-	subq	$-256,%rdi
-	subq	$-256,%rsi
-	addq	$-256,%rdx
-	cmpq	$256-1,%rdx
-	ja	L$crypt_loop_4x__func1
+	addq	$256,%rdi
+	addq	$256,%rsi
+	subq	$256,%rdx
+	cmpq	$256,%rdx
+	jae	L$crypt_loop_4x__func1
 L$ghash_last_ciphertext_4x__func1:
 	vpshufb	%zmm8,%zmm4,%zmm4
 	vpxord	%zmm10,%zmm4,%zmm4
@@ -878,8 +883,8 @@
 
 
 
-	cmpq	$256-1,%rdx
-	jbe	L$crypt_loop_4x_done__func2
+	cmpq	$256,%rdx
+	jb	L$crypt_loop_4x_done__func2
 
 
 	vmovdqu8	256-256(%r9),%zmm27
@@ -950,6 +955,10 @@
 	vaesenc	%zmm9,%zmm3,%zmm3
 
 L$aes128__func2:
+
+
+
+
 	prefetcht0	512+0(%rdi)
 	prefetcht0	512+64(%rdi)
 	prefetcht0	512+128(%rdi)
@@ -957,6 +966,7 @@
 
 
 
+
 	vpshufb	%zmm8,%zmm4,%zmm4
 	vpxord	%zmm10,%zmm4,%zmm4
 	vpshufb	%zmm8,%zmm5,%zmm5
@@ -1066,11 +1076,11 @@
 	vmovdqu8	%zmm6,128(%rsi)
 	vmovdqu8	%zmm7,192(%rsi)
 
-	subq	$-256,%rdi
-	subq	$-256,%rsi
-	addq	$-256,%rdx
-	cmpq	$256-1,%rdx
-	ja	L$crypt_loop_4x__func2
+	addq	$256,%rdi
+	addq	$256,%rsi
+	subq	$256,%rdx
+	cmpq	$256,%rdx
+	jae	L$crypt_loop_4x__func2
 L$crypt_loop_4x_done__func2:
 
 	testq	%rdx,%rdx
diff --git a/gen/bcm/aes-gcm-avx512-x86_64-linux.S b/gen/bcm/aes-gcm-avx512-x86_64-linux.S
index 56f1a44..4e77431 100644
--- a/gen/bcm/aes-gcm-avx512-x86_64-linux.S
+++ b/gen/bcm/aes-gcm-avx512-x86_64-linux.S
@@ -26,57 +26,17 @@
 .quad	1, 0xc200000000000001
 
 
-
-
-
 .Lctr_pattern:
 .quad	0, 0
 .quad	1, 0
-.Linc_2blocks:
 .quad	2, 0
 .quad	3, 0
+
+
 .Linc_4blocks:
 .quad	4, 0
 
 .text	
-.globl	gcm_gmult_vpclmulqdq_avx512
-.hidden gcm_gmult_vpclmulqdq_avx512
-.type	gcm_gmult_vpclmulqdq_avx512,@function
-.align	32
-gcm_gmult_vpclmulqdq_avx512:
-.cfi_startproc	
-
-_CET_ENDBR
-
-
-
-	vmovdqu	(%rdi),%xmm0
-	vmovdqu	.Lbswap_mask(%rip),%xmm1
-	vmovdqu	256-16(%rsi),%xmm2
-	vmovdqu	.Lgfpoly(%rip),%xmm3
-	vpshufb	%xmm1,%xmm0,%xmm0
-
-	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm4
-	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm5
-	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm6
-	vpxord	%xmm6,%xmm5,%xmm5
-	vpclmulqdq	$0x01,%xmm4,%xmm3,%xmm6
-	vpshufd	$0x4e,%xmm4,%xmm4
-	vpternlogd	$0x96,%xmm6,%xmm4,%xmm5
-	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm0
-	vpclmulqdq	$0x01,%xmm5,%xmm3,%xmm4
-	vpshufd	$0x4e,%xmm5,%xmm5
-	vpternlogd	$0x96,%xmm4,%xmm5,%xmm0
-
-
-	vpshufb	%xmm1,%xmm0,%xmm0
-	vmovdqu	%xmm0,(%rdi)
-
-
-	ret
-
-.cfi_endproc	
-.size	gcm_gmult_vpclmulqdq_avx512, . - gcm_gmult_vpclmulqdq_avx512
 .globl	gcm_init_vpclmulqdq_avx512
 .hidden gcm_init_vpclmulqdq_avx512
 .type	gcm_init_vpclmulqdq_avx512,@function
@@ -139,6 +99,8 @@
 
 	vinserti128	$1,%xmm3,%ymm4,%ymm3
 	vinserti128	$1,%xmm4,%ymm4,%ymm4
+
+
 	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm0
 	vpclmulqdq	$0x01,%ymm4,%ymm3,%ymm1
 	vpclmulqdq	$0x10,%ymm4,%ymm3,%ymm2
@@ -154,14 +116,14 @@
 	vinserti64x4	$1,%ymm3,%zmm4,%zmm3
 	vshufi64x2	$0,%zmm4,%zmm4,%zmm4
 
+
 	vmovdqu8	%zmm3,(%r8)
 
 
 
 
-
 	movl	$3,%eax
-.Lprecompute_next__func1:
+.Lprecompute_next:
 	subq	$64,%r8
 	vpclmulqdq	$0x00,%zmm4,%zmm3,%zmm0
 	vpclmulqdq	$0x01,%zmm4,%zmm3,%zmm1
@@ -177,13 +139,51 @@
 
 	vmovdqu8	%zmm3,(%r8)
 	decl	%eax
-	jnz	.Lprecompute_next__func1
+	jnz	.Lprecompute_next
 
 	vzeroupper
 	ret
 
 .cfi_endproc	
 .size	gcm_init_vpclmulqdq_avx512, . - gcm_init_vpclmulqdq_avx512
+.globl	gcm_gmult_vpclmulqdq_avx512
+.hidden gcm_gmult_vpclmulqdq_avx512
+.type	gcm_gmult_vpclmulqdq_avx512,@function
+.align	32
+gcm_gmult_vpclmulqdq_avx512:
+.cfi_startproc	
+
+_CET_ENDBR
+
+
+
+	vmovdqu	(%rdi),%xmm0
+	vmovdqu	.Lbswap_mask(%rip),%xmm1
+	vmovdqu	256-16(%rsi),%xmm2
+	vmovdqu	.Lgfpoly(%rip),%xmm3
+	vpshufb	%xmm1,%xmm0,%xmm0
+
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm4
+	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm5
+	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm6
+	vpxord	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x01,%xmm4,%xmm3,%xmm6
+	vpshufd	$0x4e,%xmm4,%xmm4
+	vpternlogd	$0x96,%xmm6,%xmm4,%xmm5
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm0
+	vpclmulqdq	$0x01,%xmm5,%xmm3,%xmm4
+	vpshufd	$0x4e,%xmm5,%xmm5
+	vpternlogd	$0x96,%xmm4,%xmm5,%xmm0
+
+
+	vpshufb	%xmm1,%xmm0,%xmm0
+	vmovdqu	%xmm0,(%rdi)
+
+
+	ret
+
+.cfi_endproc	
+.size	gcm_gmult_vpclmulqdq_avx512, . - gcm_gmult_vpclmulqdq_avx512
 .globl	gcm_ghash_vpclmulqdq_avx512
 .hidden gcm_ghash_vpclmulqdq_avx512
 .type	gcm_ghash_vpclmulqdq_avx512,@function
@@ -207,7 +207,7 @@
 
 
 	cmpq	$64,%rcx
-	jb	.Laad_blockbyblock__func1
+	jb	.Laad_blockbyblock
 
 
 
@@ -217,8 +217,8 @@
 
 	vmovdqu8	256-64(%rsi),%zmm9
 
-	cmpq	$256-1,%rcx
-	jbe	.Laad_loop_1x__func1
+	cmpq	$256,%rcx
+	jb	.Laad_loop_1x
 
 
 	vmovdqu8	256-256(%rsi),%zmm6
@@ -226,7 +226,7 @@
 	vmovdqu8	256-128(%rsi),%zmm8
 
 
-.Laad_loop_4x__func1:
+.Laad_loop_4x:
 	vmovdqu8	0(%rdx),%zmm0
 	vmovdqu8	64(%rdx),%zmm1
 	vmovdqu8	128(%rdx),%zmm2
@@ -272,15 +272,15 @@
 	vpxord	%xmm0,%xmm5,%xmm5
 	vpternlogd	$0x96,%xmm1,%xmm2,%xmm5
 
-	subq	$-256,%rdx
-	addq	$-256,%rcx
-	cmpq	$256-1,%rcx
-	ja	.Laad_loop_4x__func1
+	addq	$256,%rdx
+	subq	$256,%rcx
+	cmpq	$256,%rcx
+	jae	.Laad_loop_4x
 
 
 	cmpq	$64,%rcx
-	jb	.Laad_large_done__func1
-.Laad_loop_1x__func1:
+	jb	.Laad_large_done
+.Laad_loop_1x:
 	vmovdqu8	(%rdx),%zmm0
 	vpshufb	%zmm4,%zmm0,%zmm0
 	vpxord	%zmm0,%zmm5,%zmm5
@@ -305,16 +305,16 @@
 	addq	$64,%rdx
 	subq	$64,%rcx
 	cmpq	$64,%rcx
-	jae	.Laad_loop_1x__func1
+	jae	.Laad_loop_1x
 
-.Laad_large_done__func1:
+.Laad_large_done:
 
 
-.Laad_blockbyblock__func1:
+.Laad_blockbyblock:
 	testq	%rcx,%rcx
-	jz	.Laad_done__func1
+	jz	.Laad_done
 	vmovdqu	256-16(%rsi),%xmm9
-.Laad_loop_blockbyblock__func1:
+.Laad_loop_blockbyblock:
 	vmovdqu	(%rdx),%xmm0
 	vpshufb	%xmm4,%xmm0,%xmm0
 	vpxor	%xmm0,%xmm5,%xmm5
@@ -332,9 +332,9 @@
 
 	addq	$16,%rdx
 	subq	$16,%rcx
-	jnz	.Laad_loop_blockbyblock__func1
+	jnz	.Laad_loop_blockbyblock
 
-.Laad_done__func1:
+.Laad_done:
 
 	vpshufb	%xmm4,%xmm5,%xmm5
 	vmovdqu	%xmm5,(%rdi)
@@ -393,8 +393,8 @@
 
 
 
-	cmpq	$256-1,%rdx
-	jbe	.Lcrypt_loop_4x_done__func1
+	cmpq	$256,%rdx
+	jb	.Lcrypt_loop_4x_done__func1
 
 
 	vmovdqu8	256-256(%r9),%zmm27
@@ -444,11 +444,11 @@
 	vmovdqu8	%zmm6,128(%rsi)
 	vmovdqu8	%zmm7,192(%rsi)
 
-	subq	$-256,%rdi
-	subq	$-256,%rsi
-	addq	$-256,%rdx
-	cmpq	$256-1,%rdx
-	jbe	.Lghash_last_ciphertext_4x__func1
+	addq	$256,%rdi
+	addq	$256,%rsi
+	subq	$256,%rdx
+	cmpq	$256,%rdx
+	jb	.Lghash_last_ciphertext_4x__func1
 
 	vbroadcasti32x4	-144(%r11),%zmm15
 	vbroadcasti32x4	-128(%r11),%zmm16
@@ -509,6 +509,10 @@
 	vaesenc	%zmm9,%zmm3,%zmm3
 
 .Laes128__func1:
+
+
+
+
 	prefetcht0	512+0(%rdi)
 	prefetcht0	512+64(%rdi)
 	prefetcht0	512+128(%rdi)
@@ -516,6 +520,7 @@
 
 
 
+
 	vpshufb	%zmm8,%zmm4,%zmm4
 	vpxord	%zmm10,%zmm4,%zmm4
 	vpshufb	%zmm8,%zmm5,%zmm5
@@ -625,11 +630,11 @@
 	vmovdqu8	%zmm6,128(%rsi)
 	vmovdqu8	%zmm7,192(%rsi)
 
-	subq	$-256,%rdi
-	subq	$-256,%rsi
-	addq	$-256,%rdx
-	cmpq	$256-1,%rdx
-	ja	.Lcrypt_loop_4x__func1
+	addq	$256,%rdi
+	addq	$256,%rsi
+	subq	$256,%rdx
+	cmpq	$256,%rdx
+	jae	.Lcrypt_loop_4x__func1
 .Lghash_last_ciphertext_4x__func1:
 	vpshufb	%zmm8,%zmm4,%zmm4
 	vpxord	%zmm10,%zmm4,%zmm4
@@ -882,8 +887,8 @@
 
 
 
-	cmpq	$256-1,%rdx
-	jbe	.Lcrypt_loop_4x_done__func2
+	cmpq	$256,%rdx
+	jb	.Lcrypt_loop_4x_done__func2
 
 
 	vmovdqu8	256-256(%r9),%zmm27
@@ -954,6 +959,10 @@
 	vaesenc	%zmm9,%zmm3,%zmm3
 
 .Laes128__func2:
+
+
+
+
 	prefetcht0	512+0(%rdi)
 	prefetcht0	512+64(%rdi)
 	prefetcht0	512+128(%rdi)
@@ -961,6 +970,7 @@
 
 
 
+
 	vpshufb	%zmm8,%zmm4,%zmm4
 	vpxord	%zmm10,%zmm4,%zmm4
 	vpshufb	%zmm8,%zmm5,%zmm5
@@ -1070,11 +1080,11 @@
 	vmovdqu8	%zmm6,128(%rsi)
 	vmovdqu8	%zmm7,192(%rsi)
 
-	subq	$-256,%rdi
-	subq	$-256,%rsi
-	addq	$-256,%rdx
-	cmpq	$256-1,%rdx
-	ja	.Lcrypt_loop_4x__func2
+	addq	$256,%rdi
+	addq	$256,%rsi
+	subq	$256,%rdx
+	cmpq	$256,%rdx
+	jae	.Lcrypt_loop_4x__func2
 .Lcrypt_loop_4x_done__func2:
 
 	testq	%rdx,%rdx
diff --git a/gen/bcm/aes-gcm-avx512-x86_64-win.asm b/gen/bcm/aes-gcm-avx512-x86_64-win.asm
index 6e06094..76811a0 100644
--- a/gen/bcm/aes-gcm-avx512-x86_64-win.asm
+++ b/gen/bcm/aes-gcm-avx512-x86_64-win.asm
@@ -33,63 +33,18 @@
 	DQ	1,0xc200000000000001
 
 
-
-
-
 $L$ctr_pattern:
 	DQ	0,0
 	DQ	1,0
-$L$inc_2blocks:
 	DQ	2,0
 	DQ	3,0
+
+
 $L$inc_4blocks:
 	DQ	4,0
 
 section	.text code align=64
 
-global	gcm_gmult_vpclmulqdq_avx512
-
-ALIGN	32
-gcm_gmult_vpclmulqdq_avx512:
-
-$L$SEH_begin_gcm_gmult_vpclmulqdq_avx512_1:
-_CET_ENDBR
-	sub	rsp,24
-$L$SEH_prologue_gcm_gmult_vpclmulqdq_avx512_2:
-	vmovdqa	XMMWORD[rsp],xmm6
-$L$SEH_prologue_gcm_gmult_vpclmulqdq_avx512_3:
-
-$L$SEH_endprologue_gcm_gmult_vpclmulqdq_avx512_4:
-
-	vmovdqu	xmm0,XMMWORD[rcx]
-	vmovdqu	xmm1,XMMWORD[$L$bswap_mask]
-	vmovdqu	xmm2,XMMWORD[((256-16))+rdx]
-	vmovdqu	xmm3,XMMWORD[$L$gfpoly]
-	vpshufb	xmm0,xmm0,xmm1
-
-	vpclmulqdq	xmm4,xmm0,xmm2,0x00
-	vpclmulqdq	xmm5,xmm0,xmm2,0x01
-	vpclmulqdq	xmm6,xmm0,xmm2,0x10
-	vpxord	xmm5,xmm5,xmm6
-	vpclmulqdq	xmm6,xmm3,xmm4,0x01
-	vpshufd	xmm4,xmm4,0x4e
-	vpternlogd	xmm5,xmm4,xmm6,0x96
-	vpclmulqdq	xmm0,xmm0,xmm2,0x11
-	vpclmulqdq	xmm4,xmm3,xmm5,0x01
-	vpshufd	xmm5,xmm5,0x4e
-	vpternlogd	xmm0,xmm5,xmm4,0x96
-
-
-	vpshufb	xmm0,xmm0,xmm1
-	vmovdqu	XMMWORD[rcx],xmm0
-
-
-	vmovdqa	xmm6,XMMWORD[rsp]
-	add	rsp,24
-	ret
-$L$SEH_end_gcm_gmult_vpclmulqdq_avx512_5:
-
-
 global	gcm_init_vpclmulqdq_avx512
 
 ALIGN	32
@@ -151,6 +106,8 @@
 
 	vinserti128	ymm3,ymm4,xmm3,1
 	vinserti128	ymm4,ymm4,xmm4,1
+
+
 	vpclmulqdq	ymm0,ymm3,ymm4,0x00
 	vpclmulqdq	ymm1,ymm3,ymm4,0x01
 	vpclmulqdq	ymm2,ymm3,ymm4,0x10
@@ -166,14 +123,14 @@
 	vinserti64x4	zmm3,zmm4,ymm3,1
 	vshufi64x2	zmm4,zmm4,zmm4,0
 
+
 	vmovdqu8	ZMMWORD[r8],zmm3
 
 
 
 
-
 	mov	eax,3
-$L$precompute_next__func1:
+$L$precompute_next:
 	sub	r8,64
 	vpclmulqdq	zmm0,zmm3,zmm4,0x00
 	vpclmulqdq	zmm1,zmm3,zmm4,0x01
@@ -189,13 +146,56 @@
 
 	vmovdqu8	ZMMWORD[r8],zmm3
 	dec	eax
-	jnz	NEAR $L$precompute_next__func1
+	jnz	NEAR $L$precompute_next
 
 	vzeroupper
 	ret
 
 
 
+global	gcm_gmult_vpclmulqdq_avx512
+
+ALIGN	32
+gcm_gmult_vpclmulqdq_avx512:
+
+$L$SEH_begin_gcm_gmult_vpclmulqdq_avx512_1:
+_CET_ENDBR
+	sub	rsp,24
+$L$SEH_prologue_gcm_gmult_vpclmulqdq_avx512_2:
+	vmovdqa	XMMWORD[rsp],xmm6
+$L$SEH_prologue_gcm_gmult_vpclmulqdq_avx512_3:
+
+$L$SEH_endprologue_gcm_gmult_vpclmulqdq_avx512_4:
+
+	vmovdqu	xmm0,XMMWORD[rcx]
+	vmovdqu	xmm1,XMMWORD[$L$bswap_mask]
+	vmovdqu	xmm2,XMMWORD[((256-16))+rdx]
+	vmovdqu	xmm3,XMMWORD[$L$gfpoly]
+	vpshufb	xmm0,xmm0,xmm1
+
+	vpclmulqdq	xmm4,xmm0,xmm2,0x00
+	vpclmulqdq	xmm5,xmm0,xmm2,0x01
+	vpclmulqdq	xmm6,xmm0,xmm2,0x10
+	vpxord	xmm5,xmm5,xmm6
+	vpclmulqdq	xmm6,xmm3,xmm4,0x01
+	vpshufd	xmm4,xmm4,0x4e
+	vpternlogd	xmm5,xmm4,xmm6,0x96
+	vpclmulqdq	xmm0,xmm0,xmm2,0x11
+	vpclmulqdq	xmm4,xmm3,xmm5,0x01
+	vpshufd	xmm5,xmm5,0x4e
+	vpternlogd	xmm0,xmm5,xmm4,0x96
+
+
+	vpshufb	xmm0,xmm0,xmm1
+	vmovdqu	XMMWORD[rcx],xmm0
+
+
+	vmovdqa	xmm6,XMMWORD[rsp]
+	add	rsp,24
+	ret
+$L$SEH_end_gcm_gmult_vpclmulqdq_avx512_5:
+
+
 global	gcm_ghash_vpclmulqdq_avx512
 
 ALIGN	32
@@ -236,7 +236,7 @@
 
 
 	cmp	r9,64
-	jb	NEAR $L$aad_blockbyblock__func1
+	jb	NEAR $L$aad_blockbyblock
 
 
 
@@ -246,8 +246,8 @@
 
 	vmovdqu8	zmm9,ZMMWORD[((256-64))+rdx]
 
-	cmp	r9,4*64-1
-	jbe	NEAR $L$aad_loop_1x__func1
+	cmp	r9,256
+	jb	NEAR $L$aad_loop_1x
 
 
 	vmovdqu8	zmm6,ZMMWORD[((256-256))+rdx]
@@ -255,7 +255,7 @@
 	vmovdqu8	zmm8,ZMMWORD[((256-128))+rdx]
 
 
-$L$aad_loop_4x__func1:
+$L$aad_loop_4x:
 	vmovdqu8	zmm0,ZMMWORD[r8]
 	vmovdqu8	zmm1,ZMMWORD[64+r8]
 	vmovdqu8	zmm2,ZMMWORD[128+r8]
@@ -301,15 +301,15 @@
 	vpxord	xmm5,xmm5,xmm0
 	vpternlogd	xmm5,xmm2,xmm1,0x96
 
-	sub	r8,-4*64
-	add	r9,-4*64
-	cmp	r9,4*64-1
-	ja	NEAR $L$aad_loop_4x__func1
+	add	r8,256
+	sub	r9,256
+	cmp	r9,256
+	jae	NEAR $L$aad_loop_4x
 
 
 	cmp	r9,64
-	jb	NEAR $L$aad_large_done__func1
-$L$aad_loop_1x__func1:
+	jb	NEAR $L$aad_large_done
+$L$aad_loop_1x:
 	vmovdqu8	zmm0,ZMMWORD[r8]
 	vpshufb	zmm0,zmm0,zmm4
 	vpxord	zmm5,zmm5,zmm0
@@ -334,16 +334,16 @@
 	add	r8,64
 	sub	r9,64
 	cmp	r9,64
-	jae	NEAR $L$aad_loop_1x__func1
+	jae	NEAR $L$aad_loop_1x
 
-$L$aad_large_done__func1:
+$L$aad_large_done:
 
 
-$L$aad_blockbyblock__func1:
+$L$aad_blockbyblock:
 	test	r9,r9
-	jz	NEAR $L$aad_done__func1
+	jz	NEAR $L$aad_done
 	vmovdqu	xmm9,XMMWORD[((256-16))+rdx]
-$L$aad_loop_blockbyblock__func1:
+$L$aad_loop_blockbyblock:
 	vmovdqu	xmm0,XMMWORD[r8]
 	vpshufb	xmm0,xmm0,xmm4
 	vpxor	xmm5,xmm5,xmm0
@@ -361,9 +361,9 @@
 
 	add	r8,16
 	sub	r9,16
-	jnz	NEAR $L$aad_loop_blockbyblock__func1
+	jnz	NEAR $L$aad_loop_blockbyblock
 
-$L$aad_done__func1:
+$L$aad_done:
 
 	vpshufb	xmm5,xmm5,xmm4
 	vmovdqu	XMMWORD[rcx],xmm5
@@ -458,8 +458,8 @@
 
 
 
-	cmp	r8,4*64-1
-	jbe	NEAR $L$crypt_loop_4x_done__func1
+	cmp	r8,256
+	jb	NEAR $L$crypt_loop_4x_done__func1
 
 
 	vmovdqu8	zmm27,ZMMWORD[((256-256))+rdi]
@@ -509,11 +509,11 @@
 	vmovdqu8	ZMMWORD[128+rdx],zmm6
 	vmovdqu8	ZMMWORD[192+rdx],zmm7
 
-	sub	rcx,-4*64
-	sub	rdx,-4*64
-	add	r8,-4*64
-	cmp	r8,4*64-1
-	jbe	NEAR $L$ghash_last_ciphertext_4x__func1
+	add	rcx,256
+	add	rdx,256
+	sub	r8,256
+	cmp	r8,256
+	jb	NEAR $L$ghash_last_ciphertext_4x__func1
 
 	vbroadcasti32x4	zmm15,ZMMWORD[((-144))+r11]
 	vbroadcasti32x4	zmm16,ZMMWORD[((-128))+r11]
@@ -574,6 +574,10 @@
 	vaesenc	zmm3,zmm3,zmm9
 
 $L$aes128__func1:
+
+
+
+
 	prefetcht0	[((512+0))+rcx]
 	prefetcht0	[((512+64))+rcx]
 	prefetcht0	[((512+128))+rcx]
@@ -581,6 +585,7 @@
 
 
 
+
 	vpshufb	zmm4,zmm4,zmm8
 	vpxord	zmm4,zmm4,zmm10
 	vpshufb	zmm5,zmm5,zmm8
@@ -690,11 +695,11 @@
 	vmovdqu8	ZMMWORD[128+rdx],zmm6
 	vmovdqu8	ZMMWORD[192+rdx],zmm7
 
-	sub	rcx,-4*64
-	sub	rdx,-4*64
-	add	r8,-4*64
-	cmp	r8,4*64-1
-	ja	NEAR $L$crypt_loop_4x__func1
+	add	rcx,256
+	add	rdx,256
+	sub	r8,256
+	cmp	r8,256
+	jae	NEAR $L$crypt_loop_4x__func1
 $L$ghash_last_ciphertext_4x__func1:
 	vpshufb	zmm4,zmm4,zmm8
 	vpxord	zmm4,zmm4,zmm10
@@ -986,8 +991,8 @@
 
 
 
-	cmp	r8,4*64-1
-	jbe	NEAR $L$crypt_loop_4x_done__func2
+	cmp	r8,256
+	jb	NEAR $L$crypt_loop_4x_done__func2
 
 
 	vmovdqu8	zmm27,ZMMWORD[((256-256))+rdi]
@@ -1058,6 +1063,10 @@
 	vaesenc	zmm3,zmm3,zmm9
 
 $L$aes128__func2:
+
+
+
+
 	prefetcht0	[((512+0))+rcx]
 	prefetcht0	[((512+64))+rcx]
 	prefetcht0	[((512+128))+rcx]
@@ -1065,6 +1074,7 @@
 
 
 
+
 	vpshufb	zmm4,zmm4,zmm8
 	vpxord	zmm4,zmm4,zmm10
 	vpshufb	zmm5,zmm5,zmm8
@@ -1174,11 +1184,11 @@
 	vmovdqu8	ZMMWORD[128+rdx],zmm6
 	vmovdqu8	ZMMWORD[192+rdx],zmm7
 
-	sub	rcx,-4*64
-	sub	rdx,-4*64
-	add	r8,-4*64
-	cmp	r8,4*64-1
-	ja	NEAR $L$crypt_loop_4x__func2
+	add	rcx,256
+	add	rdx,256
+	sub	r8,256
+	cmp	r8,256
+	jae	NEAR $L$crypt_loop_4x__func2
 $L$crypt_loop_4x_done__func2:
 
 	test	r8,r8