aes-gcm-avx10-x86_64.pl: use strict mode and sync with avx2 code
Make aes-gcm-avx10-x86_64.pl run in Perl's strict mode, like
aes-gcm-avx2-x86_64.pl does. Also bring over some of the other
non-functional changes from the AVX2 version's Perl code, such as
naming V0-V3 as AESDATA0-AESDATA3 in the en/decryption function.
No change to the generated assembly code except whitespace.
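
For illustration only (a simplified sketch, not code from this patch,
with hypothetical names), the strict-mode pattern amounts to declaring
lexicals with 'my' and passing register names into the code-generating
subs explicitly instead of relying on package globals:

    use strict;

    # Sketch: registers arrive as explicit arguments, so strict mode can
    # verify every variable; the real sub takes four named parameters.
    sub _vaesenc_4x {
        my ( $round_key, @aesdata ) = @_;
        my $code = "";
        $code .= "\tvaesenc $round_key, $_, $_\n" for @aesdata;
        return $code;
    }

    print _vaesenc_4x( "%zmm9", map { "%zmm$_" } 0 .. 3 );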
Change-Id: I5857cfdcc881a34c971959a1962dfb181eb11450
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/77168
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl b/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl
index 36aef35..e8cf3be 100644
--- a/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl
@@ -71,10 +71,14 @@
# 32, masking support, and new instructions such as vpternlogd (which can do a
# three-argument XOR). These features are very useful for AES-GCM.
-$flavour = shift;
-$output = shift;
+use strict;
+
+my $flavour = shift;
+my $output = shift;
if ( $flavour =~ /\./ ) { $output = $flavour; undef $flavour; }
+my $win64;
+my @argregs;
if ( $flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/ ) {
$win64 = 1;
@argregs = ( "%rcx", "%rdx", "%r8", "%r9" );
@@ -85,7 +89,8 @@
}
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
-$dir = $1;
+my $dir = $1;
+my $xlate;
( $xlate = "${dir}x86_64-xlate.pl" and -f $xlate )
or ( $xlate = "${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate )
or die "can't locate x86_64-xlate.pl";
@@ -93,6 +98,11 @@
open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT = *OUT;
+my $g_cur_func_name;
+my $g_cur_func_uses_seh;
+my @g_cur_func_saved_gpregs;
+my @g_cur_func_saved_xmmregs;
+
sub _begin_func {
my ( $funcname, $uses_seh ) = @_;
$g_cur_func_name = $funcname;
@@ -189,7 +199,7 @@
return $code;
}
-$code = <<___;
+my $code = <<___;
.section .rodata
.align 64
@@ -229,31 +239,68 @@
# Number of powers of the hash key stored in the key struct. The powers are
# stored from highest (H^NUM_H_POWERS) to lowest (H^1).
-$NUM_H_POWERS = 16;
+my $NUM_H_POWERS = 16;
-$OFFSETOFEND_H_POWERS = $NUM_H_POWERS * 16;
+my $OFFSETOFEND_H_POWERS = $NUM_H_POWERS * 16;
# Offset to 'rounds' in AES_KEY struct
-$OFFSETOF_AES_ROUNDS = 240;
+my $OFFSETOF_AES_ROUNDS = 240;
# The current vector length in bytes
-undef $VL;
+my $VL;
+
+my (
+ $V0, $V1, $V2, $V3, $V4, $V5, $V6, $V7, $V8, $V9, $V10,
+ $V11, $V12, $V13, $V14, $V15, $V16, $V17, $V18, $V19, $V20, $V21,
+ $V22, $V23, $V24, $V25, $V26, $V27, $V28, $V29, $V30, $V31
+);
# Set the vector length in bytes. This sets the VL variable and defines
# register aliases V0-V31 that map to the ymm or zmm registers.
sub _set_veclen {
($VL) = @_;
- foreach my $i ( 0 .. 31 ) {
- if ( $VL == 32 ) {
- ${"V${i}"} = "%ymm${i}";
- }
- elsif ( $VL == 64 ) {
- ${"V${i}"} = "%zmm${i}";
- }
- else {
- die "Unsupported vector length";
- }
+ my $prefix;
+ if ( $VL == 32 ) {
+ $prefix = "%ymm";
}
+ elsif ( $VL == 64 ) {
+ $prefix = "%zmm";
+ }
+ else {
+ die "Unsupported vector length";
+ }
+ $V0 = "${prefix}0";
+ $V1 = "${prefix}1";
+ $V2 = "${prefix}2";
+ $V3 = "${prefix}3";
+ $V4 = "${prefix}4";
+ $V5 = "${prefix}5";
+ $V6 = "${prefix}6";
+ $V7 = "${prefix}7";
+ $V8 = "${prefix}8";
+ $V9 = "${prefix}9";
+ $V10 = "${prefix}10";
+ $V11 = "${prefix}11";
+ $V12 = "${prefix}12";
+ $V13 = "${prefix}13";
+ $V14 = "${prefix}14";
+ $V15 = "${prefix}15";
+ $V16 = "${prefix}16";
+ $V17 = "${prefix}17";
+ $V18 = "${prefix}18";
+ $V19 = "${prefix}19";
+ $V20 = "${prefix}20";
+ $V21 = "${prefix}21";
+ $V22 = "${prefix}22";
+ $V23 = "${prefix}23";
+ $V24 = "${prefix}24";
+ $V25 = "${prefix}25";
+ $V26 = "${prefix}26";
+ $V27 = "${prefix}27";
+ $V28 = "${prefix}28";
+ $V29 = "${prefix}29";
+ $V30 = "${prefix}30";
+ $V31 = "${prefix}31";
}
# The _ghash_mul macro multiplies the 128-bit lanes of \a by the corresponding
@@ -401,7 +448,7 @@
___
}
-$g_init_macro_expansion_count = 0;
+my $g_init_macro_expansion_count = 0;
# void gcm_init_##suffix(u128 Htable[16], const uint64_t H[2]);
#
@@ -419,7 +466,10 @@
# Function arguments
my ( $HTABLE, $H_PTR ) = @argregs[ 0 .. 1 ];
- # Additional local variables. V0-V2 and %rax are used as temporaries.
+ # Additional local variables. %rax is used as a temporary register.
+ my ( $TMP0, $TMP0_YMM, $TMP0_XMM ) = ( $V0, "%ymm0", "%xmm0" );
+ my ( $TMP1, $TMP1_YMM, $TMP1_XMM ) = ( $V1, "%ymm1", "%xmm1" );
+ my ( $TMP2, $TMP2_YMM, $TMP2_XMM ) = ( $V2, "%ymm2", "%xmm2" );
my $POWERS_PTR = "%r8";
my $RNDKEYLAST_PTR = "%r9";
my ( $H_CUR, $H_CUR_YMM, $H_CUR_XMM ) = ( "$V3", "%ymm3", "%xmm3" );
@@ -449,11 +499,11 @@
# << 120) | 1 if a 1 bit was carried out. However, there's no 128-bit
# wide shift instruction, so instead double each of the two 64-bit
# halves and incorporate the internal carry bit into the value XOR'd.
- vpshufd \$0xd3, $H_CUR_XMM, %xmm0
- vpsrad \$31, %xmm0, %xmm0
+ vpshufd \$0xd3, $H_CUR_XMM, $TMP0_XMM
+ vpsrad \$31, $TMP0_XMM, $TMP0_XMM
vpaddq $H_CUR_XMM, $H_CUR_XMM, $H_CUR_XMM
- # H_CUR_XMM ^= xmm0 & gfpoly_and_internal_carrybit
- vpternlogd \$0x78, .Lgfpoly_and_internal_carrybit(%rip), %xmm0, $H_CUR_XMM
+ # H_CUR_XMM ^= TMP0_XMM & gfpoly_and_internal_carrybit
+ vpternlogd \$0x78, .Lgfpoly_and_internal_carrybit(%rip), $TMP0_XMM, $H_CUR_XMM
# Load the gfpoly constant.
vbroadcasti32x4 .Lgfpoly(%rip), $GFPOLY
@@ -466,7 +516,7 @@
# end up with two factors of x^-1, but the multiplication consumes one.
# So the product H^2 ends up with the desired one factor of x^-1.
@{[ _ghash_mul $H_CUR_XMM, $H_CUR_XMM, $H_INC_XMM, $GFPOLY_XMM,
- "%xmm0", "%xmm1", "%xmm2" ]}
+ $TMP0_XMM, $TMP1_XMM, $TMP2_XMM ]}
# Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2].
vinserti128 \$1, $H_CUR_XMM, $H_INC_YMM, $H_CUR_YMM
@@ -478,7 +528,7 @@
# Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4].
$code .= <<___;
@{[ _ghash_mul $H_INC_YMM, $H_CUR_YMM, $H_INC_YMM, $GFPOLY_YMM,
- "%ymm0", "%ymm1", "%ymm2" ]}
+ $TMP0_YMM, $TMP1_YMM, $TMP2_YMM ]}
vinserti64x4 \$1, $H_CUR_YMM, $H_INC, $H_CUR
vshufi64x2 \$0, $H_INC, $H_INC, $H_INC
___
@@ -495,7 +545,7 @@
mov \$@{[ $NUM_H_POWERS*16/$VL - 1 ]}, %eax
.Lprecompute_next$local_label_suffix:
sub \$$VL, $POWERS_PTR
- @{[ _ghash_mul $H_INC, $H_CUR, $H_CUR, $GFPOLY, $V0, $V1, $V2 ]}
+ @{[ _ghash_mul $H_INC, $H_CUR, $H_CUR, $GFPOLY, $TMP0, $TMP1, $TMP2 ]}
vmovdqu8 $H_CUR, ($POWERS_PTR)
dec %eax
jnz .Lprecompute_next$local_label_suffix
@@ -564,7 +614,13 @@
# See _ghash_mul for the full explanation of the operations performed for each
# individual finite field multiplication and reduction.
sub _ghash_step_4x {
- my ($i) = @_;
+ my (
+ $i, $BSWAP_MASK, $GHASHDATA0, $GHASHDATA1,
+ $GHASHDATA2, $GHASHDATA3, $GHASHDATA0_XMM, $GHASHDATA1_XMM,
+ $GHASHDATA2_XMM, $GHASHDATA3_XMM, $H_POW4, $H_POW3,
+ $H_POW2, $H_POW1, $GFPOLY, $GHASHTMP0,
+ $GHASHTMP1, $GHASHTMP2, $GHASH_ACC, $GHASH_ACC_XMM
+ ) = @_;
if ( $i == 0 ) {
return <<___;
vpshufb $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0
@@ -642,17 +698,17 @@
}
}
-# Update GHASH with the blocks given in GHASHDATA[0-3].
-# See _ghash_step_4x for full explanation.
+# Update GHASH with four vectors of data blocks. See _ghash_step_4x for full
+# explanation.
sub _ghash_4x {
my $code = "";
for my $i ( 0 .. 9 ) {
- $code .= _ghash_step_4x $i;
+ $code .= _ghash_step_4x $i, @_;
}
return $code;
}
-$g_ghash_macro_expansion_count = 0;
+my $g_ghash_macro_expansion_count = 0;
# void gcm_ghash_##suffix(uint8_t Xi[16], const u128 Htable[16],
# const uint8_t *in, size_t len);
@@ -661,10 +717,8 @@
# prototype. This macro supports both VL=32 and VL=64. _set_veclen must have
# been invoked with the desired length.
#
-# The generated function processes the AAD (Additional Authenticated Data) in
-# GCM. Using the key |Htable|, it updates the GHASH accumulator |Xi| with the
-# data given by |in| and |len|. On the first call, |Xi| must be all zeroes.
-# |len| must be a multiple of 16.
+# This function uses the key |Htable| to update the GHASH accumulator |Xi| with
+# the data given by |in| and |len|. |len| must be a multiple of 16.
#
# This function handles large amounts of AAD efficiently, while also keeping the
# overhead low for small amounts of AAD which is the common case. TLS uses less
@@ -674,19 +728,22 @@
my $code = "";
# Function arguments
- my ( $GHASH_ACC_PTR, $H_POWERS, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ];
+ my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ];
# Additional local variables
- ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( $V0, "%xmm0" );
- ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( $V1, "%xmm1" );
- ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( $V2, "%xmm2" );
- ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( $V3, "%xmm3" );
- ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( $V4, "%xmm4" );
- ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( $V5, "%xmm5" );
- ( $H_POW4, $H_POW3, $H_POW2 ) = ( $V6, $V7, $V8 );
- ( $H_POW1, $H_POW1_XMM ) = ( $V9, "%xmm9" );
- ( $GFPOLY, $GFPOLY_XMM ) = ( $V10, "%xmm10" );
- ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) = ( $V11, $V12, $V13 );
+ my ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( $V0, "%xmm0" );
+ my ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( $V1, "%xmm1" );
+ my ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( $V2, "%xmm2" );
+ my ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( $V3, "%xmm3" );
+ my @GHASHDATA = ( $GHASHDATA0, $GHASHDATA1, $GHASHDATA2, $GHASHDATA3 );
+ my @GHASHDATA_XMM =
+ ( $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM, $GHASHDATA3_XMM );
+ my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( $V4, "%xmm4" );
+ my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( $V5, "%xmm5" );
+ my ( $H_POW4, $H_POW3, $H_POW2 ) = ( $V6, $V7, $V8 );
+ my ( $H_POW1, $H_POW1_XMM ) = ( $V9, "%xmm9" );
+ my ( $GFPOLY, $GFPOLY_XMM ) = ( $V10, "%xmm10" );
+ my ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) = ( $V11, $V12, $V13 );
$code .= <<___;
@{[ _save_xmmregs (6 .. 13) ]}
@@ -712,15 +769,15 @@
vshufi64x2 \$0, $GFPOLY, $GFPOLY, $GFPOLY
# Load the lowest set of key powers.
- vmovdqu8 $OFFSETOFEND_H_POWERS-1*$VL($H_POWERS), $H_POW1
+ vmovdqu8 $OFFSETOFEND_H_POWERS-1*$VL($HTABLE), $H_POW1
cmp \$4*$VL-1, $AADLEN
jbe .Laad_loop_1x$local_label_suffix
# AADLEN >= 4*VL. Load the higher key powers.
- vmovdqu8 $OFFSETOFEND_H_POWERS-4*$VL($H_POWERS), $H_POW4
- vmovdqu8 $OFFSETOFEND_H_POWERS-3*$VL($H_POWERS), $H_POW3
- vmovdqu8 $OFFSETOFEND_H_POWERS-2*$VL($H_POWERS), $H_POW2
+ vmovdqu8 $OFFSETOFEND_H_POWERS-4*$VL($HTABLE), $H_POW4
+ vmovdqu8 $OFFSETOFEND_H_POWERS-3*$VL($HTABLE), $H_POW3
+ vmovdqu8 $OFFSETOFEND_H_POWERS-2*$VL($HTABLE), $H_POW2
# Update GHASH with 4*VL bytes of AAD at a time.
.Laad_loop_4x$local_label_suffix:
@@ -728,7 +785,9 @@
vmovdqu8 1*$VL($AAD), $GHASHDATA1
vmovdqu8 2*$VL($AAD), $GHASHDATA2
vmovdqu8 3*$VL($AAD), $GHASHDATA3
- @{[ _ghash_4x ]}
+ @{[ _ghash_4x $BSWAP_MASK, @GHASHDATA, @GHASHDATA_XMM, $H_POW4, $H_POW3,
+ $H_POW2, $H_POW1, $GFPOLY, $GHASHTMP0, $GHASHTMP1,
+ $GHASHTMP2, $GHASH_ACC, $GHASH_ACC_XMM ]}
sub \$-4*$VL, $AAD # shorter than 'add 4*VL' when VL=32
add \$-4*$VL, $AADLEN
cmp \$4*$VL-1, $AADLEN
@@ -759,7 +818,7 @@
.Laad_blockbyblock$local_label_suffix:
test $AADLEN, $AADLEN
jz .Laad_done$local_label_suffix
- vmovdqu $OFFSETOFEND_H_POWERS-16($H_POWERS), $H_POW1_XMM
+ vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1_XMM
.Laad_loop_blockbyblock$local_label_suffix:
vmovdqu ($AAD), $GHASHDATA0_XMM
vpshufb $BSWAP_MASK_XMM, $GHASHDATA0_XMM, $GHASHDATA0_XMM
@@ -778,69 +837,72 @@
return $code;
}
-# Do one non-last round of AES encryption on the counter blocks in V0-V3 using
-# the round key that has been broadcast to all 128-bit lanes of \round_key.
+# Do one non-last round of AES encryption on the counter blocks in aesdata[0-3]
+# using the round key that has been broadcast to all 128-bit lanes of round_key.
sub _vaesenc_4x {
- my ($round_key) = @_;
+ my ( $round_key, $aesdata0, $aesdata1, $aesdata2, $aesdata3 ) = @_;
return <<___;
- vaesenc $round_key, $V0, $V0
- vaesenc $round_key, $V1, $V1
- vaesenc $round_key, $V2, $V2
- vaesenc $round_key, $V3, $V3
+ vaesenc $round_key, $aesdata0, $aesdata0
+ vaesenc $round_key, $aesdata1, $aesdata1
+ vaesenc $round_key, $aesdata2, $aesdata2
+ vaesenc $round_key, $aesdata3, $aesdata3
___
}
# Start the AES encryption of four vectors of counter blocks.
sub _ctr_begin_4x {
+ my (
+ $le_ctr, $le_ctr_inc, $bswap_mask, $rndkey0,
+ $aesdata0, $aesdata1, $aesdata2, $aesdata3
+ ) = @_;
return <<___;
- # Increment LE_CTR four times to generate four vectors of little-endian
- # counter blocks, swap each to big-endian, and store them in V0-V3.
- vpshufb $BSWAP_MASK, $LE_CTR, $V0
- vpaddd $LE_CTR_INC, $LE_CTR, $LE_CTR
- vpshufb $BSWAP_MASK, $LE_CTR, $V1
- vpaddd $LE_CTR_INC, $LE_CTR, $LE_CTR
- vpshufb $BSWAP_MASK, $LE_CTR, $V2
- vpaddd $LE_CTR_INC, $LE_CTR, $LE_CTR
- vpshufb $BSWAP_MASK, $LE_CTR, $V3
- vpaddd $LE_CTR_INC, $LE_CTR, $LE_CTR
+ # Increment le_ctr four times to generate four vectors of little-endian
+ # counter blocks, swap each to big-endian, and store them in aesdata[0-3].
+ vpshufb $bswap_mask, $le_ctr, $aesdata0
+ vpaddd $le_ctr_inc, $le_ctr, $le_ctr
+ vpshufb $bswap_mask, $le_ctr, $aesdata1
+ vpaddd $le_ctr_inc, $le_ctr, $le_ctr
+ vpshufb $bswap_mask, $le_ctr, $aesdata2
+ vpaddd $le_ctr_inc, $le_ctr, $le_ctr
+ vpshufb $bswap_mask, $le_ctr, $aesdata3
+ vpaddd $le_ctr_inc, $le_ctr, $le_ctr
# AES "round zero": XOR in the zero-th round key.
- vpxord $RNDKEY0, $V0, $V0
- vpxord $RNDKEY0, $V1, $V1
- vpxord $RNDKEY0, $V2, $V2
- vpxord $RNDKEY0, $V3, $V3
+ vpxord $rndkey0, $aesdata0, $aesdata0
+ vpxord $rndkey0, $aesdata1, $aesdata1
+ vpxord $rndkey0, $aesdata2, $aesdata2
+ vpxord $rndkey0, $aesdata3, $aesdata3
___
}
-# Do the last AES round for four vectors of counter blocks V0-V3, XOR source
-# data with the resulting keystream, and write the result to DST and
-# GHASHDATA[0-3]. (Implementation differs slightly, but has the same effect.)
+# Do the last AES round for four vectors of counter blocks, XOR four vectors of
+# source data with the resulting keystream blocks, and write the result to the
+# destination buffer and ghashdata[0-3]. The implementation differs slightly as
+# it takes advantage of the property vaesenclast(key, a) ^ b ==
+# vaesenclast(key ^ b, a) to reduce latency, but it has the same effect.
sub _aesenclast_and_xor_4x {
+ my (
+ $src, $dst, $rndkeylast, $aesdata0,
+ $aesdata1, $aesdata2, $aesdata3, $ghashdata0,
+ $ghashdata1, $ghashdata2, $ghashdata3
+ ) = @_;
return <<___;
- # XOR the source data with the last round key, saving the result in
- # GHASHDATA[0-3]. This reduces latency by taking advantage of the
- # property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
- vpxord 0*$VL($SRC), $RNDKEYLAST, $GHASHDATA0
- vpxord 1*$VL($SRC), $RNDKEYLAST, $GHASHDATA1
- vpxord 2*$VL($SRC), $RNDKEYLAST, $GHASHDATA2
- vpxord 3*$VL($SRC), $RNDKEYLAST, $GHASHDATA3
-
- # Do the last AES round. This handles the XOR with the source data
- # too, as per the optimization described above.
- vaesenclast $GHASHDATA0, $V0, $GHASHDATA0
- vaesenclast $GHASHDATA1, $V1, $GHASHDATA1
- vaesenclast $GHASHDATA2, $V2, $GHASHDATA2
- vaesenclast $GHASHDATA3, $V3, $GHASHDATA3
-
- # Store the en/decrypted data to DST.
- vmovdqu8 $GHASHDATA0, 0*$VL($DST)
- vmovdqu8 $GHASHDATA1, 1*$VL($DST)
- vmovdqu8 $GHASHDATA2, 2*$VL($DST)
- vmovdqu8 $GHASHDATA3, 3*$VL($DST)
+ vpxord 0*$VL($src), $rndkeylast, $ghashdata0
+ vpxord 1*$VL($src), $rndkeylast, $ghashdata1
+ vpxord 2*$VL($src), $rndkeylast, $ghashdata2
+ vpxord 3*$VL($src), $rndkeylast, $ghashdata3
+ vaesenclast $ghashdata0, $aesdata0, $ghashdata0
+ vaesenclast $ghashdata1, $aesdata1, $ghashdata1
+ vaesenclast $ghashdata2, $aesdata2, $ghashdata2
+ vaesenclast $ghashdata3, $aesdata3, $ghashdata3
+ vmovdqu8 $ghashdata0, 0*$VL($dst)
+ vmovdqu8 $ghashdata1, 1*$VL($dst)
+ vmovdqu8 $ghashdata2, 2*$VL($dst)
+ vmovdqu8 $ghashdata3, 3*$VL($dst)
___
}
-$g_update_macro_expansion_count = 0;
+my $g_update_macro_expansion_count = 0;
# void aes_gcm_{enc,dec}_update_##suffix(const uint8_t *in, uint8_t *out,
# size_t len, const AES_KEY *key,
@@ -868,60 +930,64 @@
# 32-bit word of the counter is incremented, following the GCM standard.
sub _aes_gcm_update {
my $local_label_suffix = "__func" . ++$g_update_macro_expansion_count;
-
- my ($enc) = @_;
-
- my $code = "";
+ my ($enc) = @_;
+ my $code = "";
# Function arguments
- ( $SRC, $DST, $DATALEN, $AESKEY, $BE_CTR_PTR, $H_POWERS, $GHASH_ACC_PTR ) =
- $win64
+ my ( $SRC, $DST, $DATALEN, $AESKEY, $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR )
+ = $win64
? ( @argregs[ 0 .. 3 ], "%rsi", "%rdi", "%r12" )
: ( @argregs[ 0 .. 5 ], "%r12" );
- # Additional local variables
-
+ # Additional local variables.
# %rax, %k1, and %k2 are used as temporary registers. BE_CTR_PTR is
# also available as a temporary register after the counter is loaded.
# AES key length in bytes
- ( $AESKEYLEN, $AESKEYLEN64 ) = ( "%r10d", "%r10" );
+ my ( $AESKEYLEN, $AESKEYLEN64 ) = ( "%r10d", "%r10" );
# Pointer to the last AES round key for the chosen AES variant
- $RNDKEYLAST_PTR = "%r11";
+ my $RNDKEYLAST_PTR = "%r11";
- # In the main loop, V0-V3 are used as AES input and output. Elsewhere
- # they are used as temporary registers.
+ # AESDATA[0-3] hold the counter blocks that are being encrypted by AES.
+ my ( $AESDATA0, $AESDATA0_XMM ) = ( $V0, "%xmm0" );
+ my ( $AESDATA1, $AESDATA1_XMM ) = ( $V1, "%xmm1" );
+ my ( $AESDATA2, $AESDATA2_XMM ) = ( $V2, "%xmm2" );
+ my ( $AESDATA3, $AESDATA3_XMM ) = ( $V3, "%xmm3" );
+ my @AESDATA = ( $AESDATA0, $AESDATA1, $AESDATA2, $AESDATA3 );
# GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data.
- ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( $V4, "%xmm4" );
- ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( $V5, "%xmm5" );
- ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( $V6, "%xmm6" );
- ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( $V7, "%xmm7" );
+ my ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( $V4, "%xmm4" );
+ my ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( $V5, "%xmm5" );
+ my ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( $V6, "%xmm6" );
+ my ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( $V7, "%xmm7" );
+ my @GHASHDATA = ( $GHASHDATA0, $GHASHDATA1, $GHASHDATA2, $GHASHDATA3 );
+ my @GHASHDATA_XMM =
+ ( $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM, $GHASHDATA3_XMM );
# BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
# using vpshufb, copied to all 128-bit lanes.
- ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( $V8, "%xmm8" );
+ my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( $V8, "%xmm8" );
# RNDKEY temporarily holds the next AES round key.
- $RNDKEY = $V9;
+ my $RNDKEY = $V9;
# GHASH_ACC is the accumulator variable for GHASH. When fully reduced,
# only the lowest 128-bit lane can be nonzero. When not fully reduced,
# more than one lane may be used, and they need to be XOR'd together.
- ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( $V10, "%xmm10" );
+ my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( $V10, "%xmm10" );
# LE_CTR_INC is the vector of 32-bit words that need to be added to a
# vector of little-endian counter blocks to advance it forwards.
- $LE_CTR_INC = $V11;
+ my $LE_CTR_INC = $V11;
# LE_CTR contains the next set of little-endian counter blocks.
- $LE_CTR = $V12;
+ my $LE_CTR = $V12;
# RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys,
# copied to all 128-bit lanes. RNDKEY0 is the zero-th round key,
# RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
- (
+ my (
$RNDKEY0, $RNDKEYLAST, $RNDKEY_M9, $RNDKEY_M8,
$RNDKEY_M7, $RNDKEY_M6, $RNDKEY_M5, $RNDKEY_M4,
$RNDKEY_M3, $RNDKEY_M2, $RNDKEY_M1
@@ -930,20 +996,27 @@
# GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These
# cannot coincide with anything used for AES encryption, since for
# performance reasons GHASH and AES encryption are interleaved.
- ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) = ( $V24, $V25, $V26 );
+ my ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) = ( $V24, $V25, $V26 );
# H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1. The
# descending numbering reflects the order of the key powers.
- ( $H_POW4, $H_POW3, $H_POW2, $H_POW1 ) = ( $V27, $V28, $V29, $V30 );
+ my ( $H_POW4, $H_POW3, $H_POW2, $H_POW1 ) = ( $V27, $V28, $V29, $V30 );
# GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes.
- $GFPOLY = $V31;
+ my $GFPOLY = $V31;
+
+ my @ghash_4x_args = (
+ $BSWAP_MASK, @GHASHDATA, @GHASHDATA_XMM, $H_POW4,
+ $H_POW3, $H_POW2, $H_POW1, $GFPOLY,
+ $GHASHTMP0, $GHASHTMP1, $GHASHTMP2, $GHASH_ACC,
+ $GHASH_ACC_XMM
+ );
if ($win64) {
$code .= <<___;
- @{[ _save_gpregs $BE_CTR_PTR, $H_POWERS, $GHASH_ACC_PTR ]}
+ @{[ _save_gpregs $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR ]}
mov 64(%rsp), $BE_CTR_PTR # arg5
- mov 72(%rsp), $H_POWERS # arg6
+ mov 72(%rsp), $HTABLE # arg6
mov 80(%rsp), $GHASH_ACC_PTR # arg7
@{[ _save_xmmregs (6 .. 15) ]}
.seh_endprologue
@@ -1000,10 +1073,10 @@
jbe .Lcrypt_loop_4x_done$local_label_suffix
# Load powers of the hash key.
- vmovdqu8 $OFFSETOFEND_H_POWERS-4*$VL($H_POWERS), $H_POW4
- vmovdqu8 $OFFSETOFEND_H_POWERS-3*$VL($H_POWERS), $H_POW3
- vmovdqu8 $OFFSETOFEND_H_POWERS-2*$VL($H_POWERS), $H_POW2
- vmovdqu8 $OFFSETOFEND_H_POWERS-1*$VL($H_POWERS), $H_POW1
+ vmovdqu8 $OFFSETOFEND_H_POWERS-4*$VL($HTABLE), $H_POW4
+ vmovdqu8 $OFFSETOFEND_H_POWERS-3*$VL($HTABLE), $H_POW3
+ vmovdqu8 $OFFSETOFEND_H_POWERS-2*$VL($HTABLE), $H_POW2
+ vmovdqu8 $OFFSETOFEND_H_POWERS-1*$VL($HTABLE), $H_POW1
___
# Main loop: en/decrypt and hash 4 vectors at a time.
@@ -1025,15 +1098,15 @@
$code .= <<___;
# Encrypt the first 4 vectors of plaintext blocks. Leave the resulting
# ciphertext in GHASHDATA[0-3] for GHASH.
- @{[ _ctr_begin_4x ]}
+ @{[ _ctr_begin_4x $LE_CTR, $LE_CTR_INC, $BSWAP_MASK, $RNDKEY0, @AESDATA ]}
lea 16($AESKEY), %rax
.Lvaesenc_loop_first_4_vecs$local_label_suffix:
vbroadcasti32x4 (%rax), $RNDKEY
- @{[ _vaesenc_4x $RNDKEY ]}
+ @{[ _vaesenc_4x $RNDKEY, @AESDATA ]}
add \$16, %rax
cmp %rax, $RNDKEYLAST_PTR
jne .Lvaesenc_loop_first_4_vecs$local_label_suffix
- @{[ _aesenclast_and_xor_4x ]}
+ @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA, @GHASHDATA ]}
sub \$-4*$VL, $SRC # shorter than 'add 4*VL' when VL=32
sub \$-4*$VL, $DST
add \$-4*$VL, $DATALEN
@@ -1042,14 +1115,18 @@
___
}
- # Cache as many additional AES round keys as possible.
- for my $i ( reverse 1 .. 9 ) {
- $code .= <<___;
- vbroadcasti32x4 -$i*16($RNDKEYLAST_PTR), ${"RNDKEY_M$i"}
-___
- }
-
$code .= <<___;
+ # Cache as many additional AES round keys as possible.
+ vbroadcasti32x4 -9*16($RNDKEYLAST_PTR), $RNDKEY_M9
+ vbroadcasti32x4 -8*16($RNDKEYLAST_PTR), $RNDKEY_M8
+ vbroadcasti32x4 -7*16($RNDKEYLAST_PTR), $RNDKEY_M7
+ vbroadcasti32x4 -6*16($RNDKEYLAST_PTR), $RNDKEY_M6
+ vbroadcasti32x4 -5*16($RNDKEYLAST_PTR), $RNDKEY_M5
+ vbroadcasti32x4 -4*16($RNDKEYLAST_PTR), $RNDKEY_M4
+ vbroadcasti32x4 -3*16($RNDKEYLAST_PTR), $RNDKEY_M3
+ vbroadcasti32x4 -2*16($RNDKEYLAST_PTR), $RNDKEY_M2
+ vbroadcasti32x4 -1*16($RNDKEYLAST_PTR), $RNDKEY_M1
+
.Lcrypt_loop_4x$local_label_suffix:
___
@@ -1066,20 +1143,20 @@
$code .= <<___;
# Start the AES encryption of the counter blocks.
- @{[ _ctr_begin_4x ]}
+ @{[ _ctr_begin_4x $LE_CTR, $LE_CTR_INC, $BSWAP_MASK, $RNDKEY0, @AESDATA ]}
cmp \$24, $AESKEYLEN
jl .Laes128$local_label_suffix
je .Laes192$local_label_suffix
# AES-256
vbroadcasti32x4 -13*16($RNDKEYLAST_PTR), $RNDKEY
- @{[ _vaesenc_4x $RNDKEY ]}
+ @{[ _vaesenc_4x $RNDKEY, @AESDATA ]}
vbroadcasti32x4 -12*16($RNDKEYLAST_PTR), $RNDKEY
- @{[ _vaesenc_4x $RNDKEY ]}
+ @{[ _vaesenc_4x $RNDKEY, @AESDATA ]}
.Laes192$local_label_suffix:
vbroadcasti32x4 -11*16($RNDKEYLAST_PTR), $RNDKEY
- @{[ _vaesenc_4x $RNDKEY ]}
+ @{[ _vaesenc_4x $RNDKEY, @AESDATA ]}
vbroadcasti32x4 -10*16($RNDKEYLAST_PTR), $RNDKEY
- @{[ _vaesenc_4x $RNDKEY ]}
+ @{[ _vaesenc_4x $RNDKEY, @AESDATA ]}
.Laes128$local_label_suffix:
___
@@ -1090,17 +1167,31 @@
$code .= "prefetcht0 512+$i($SRC)\n";
}
- # Finish the AES encryption of the counter blocks in V0-V3, interleaved
- # with the GHASH update of the ciphertext blocks in GHASHDATA[0-3].
- for my $i ( reverse 1 .. 9 ) {
- $code .= <<___;
- @{[ _ghash_step_4x (9 - $i) ]}
- @{[ _vaesenc_4x ${"RNDKEY_M$i"} ]}
-___
- }
$code .= <<___;
- @{[ _ghash_step_4x 9 ]}
- @{[ _aesenclast_and_xor_4x ]}
+ # Finish the AES encryption of the counter blocks in AESDATA[0-3],
+ # interleaved with the GHASH update of the ciphertext blocks in
+ # GHASHDATA[0-3].
+ @{[ _ghash_step_4x 0, @ghash_4x_args ]}
+ @{[ _vaesenc_4x $RNDKEY_M9, @AESDATA ]}
+ @{[ _ghash_step_4x 1, @ghash_4x_args ]}
+ @{[ _vaesenc_4x $RNDKEY_M8, @AESDATA ]}
+ @{[ _ghash_step_4x 2, @ghash_4x_args ]}
+ @{[ _vaesenc_4x $RNDKEY_M7, @AESDATA ]}
+ @{[ _ghash_step_4x 3, @ghash_4x_args ]}
+ @{[ _vaesenc_4x $RNDKEY_M6, @AESDATA ]}
+ @{[ _ghash_step_4x 4, @ghash_4x_args ]}
+ @{[ _vaesenc_4x $RNDKEY_M5, @AESDATA ]}
+ @{[ _ghash_step_4x 5, @ghash_4x_args ]}
+ @{[ _vaesenc_4x $RNDKEY_M4, @AESDATA ]}
+ @{[ _ghash_step_4x 6, @ghash_4x_args ]}
+ @{[ _vaesenc_4x $RNDKEY_M3, @AESDATA ]}
+ @{[ _ghash_step_4x 7, @ghash_4x_args ]}
+ @{[ _vaesenc_4x $RNDKEY_M2, @AESDATA ]}
+ @{[ _ghash_step_4x 8, @ghash_4x_args ]}
+ @{[ _vaesenc_4x $RNDKEY_M1, @AESDATA ]}
+
+ @{[ _ghash_step_4x 9, @ghash_4x_args ]}
+ @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA, @GHASHDATA ]}
sub \$-4*$VL, $SRC # shorter than 'add 4*VL' when VL=32
sub \$-4*$VL, $DST
add \$-4*$VL, $DATALEN
@@ -1113,7 +1204,7 @@
# Update GHASH with the last set of ciphertext blocks.
$code .= <<___;
.Lghash_last_ciphertext_4x$local_label_suffix:
- @{[ _ghash_4x ]}
+ @{[ _ghash_4x @ghash_4x_args ]}
___
}
@@ -1147,7 +1238,7 @@
mov $DATALEN, %rax
neg %rax
and \$-16, %rax # -round_up(DATALEN, 16)
- lea $OFFSETOFEND_H_POWERS($H_POWERS,%rax), $POWERS_PTR
+ lea $OFFSETOFEND_H_POWERS($HTABLE,%rax), $POWERS_PTR
___
# Start collecting the unreduced GHASH intermediate value LO, MI, HI.
@@ -1166,29 +1257,29 @@
# Process a full vector of length VL.
# Encrypt a vector of counter blocks.
- vpshufb $BSWAP_MASK, $LE_CTR, $V0
+ vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA0
vpaddd $LE_CTR_INC, $LE_CTR, $LE_CTR
- vpxord $RNDKEY0, $V0, $V0
+ vpxord $RNDKEY0, $AESDATA0, $AESDATA0
lea 16($AESKEY), %rax
.Lvaesenc_loop_tail_full_vec$local_label_suffix:
vbroadcasti32x4 (%rax), $RNDKEY
- vaesenc $RNDKEY, $V0, $V0
+ vaesenc $RNDKEY, $AESDATA0, $AESDATA0
add \$16, %rax
cmp %rax, $RNDKEYLAST_PTR
jne .Lvaesenc_loop_tail_full_vec$local_label_suffix
- vaesenclast $RNDKEYLAST, $V0, $V0
+ vaesenclast $RNDKEYLAST, $AESDATA0, $AESDATA0
# XOR the data with the vector of keystream blocks.
- vmovdqu8 ($SRC), $V1
- vpxord $V1, $V0, $V0
- vmovdqu8 $V0, ($DST)
+ vmovdqu8 ($SRC), $AESDATA1
+ vpxord $AESDATA1, $AESDATA0, $AESDATA0
+ vmovdqu8 $AESDATA0, ($DST)
# Update GHASH with the ciphertext blocks, without reducing.
vmovdqu8 ($POWERS_PTR), $H_POW1
- vpshufb $BSWAP_MASK, @{[ $enc ? $V0 : $V1 ]}, $V0
- vpxord $GHASH_ACC, $V0, $V0
- @{[ _ghash_mul_noreduce $H_POW1, $V0, $LO, $MI, $HI, $GHASHDATA3,
- $V1, $V2, $V3 ]}
+ vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $AESDATA1 ]}, $AESDATA0
+ vpxord $GHASH_ACC, $AESDATA0, $AESDATA0
+ @{[ _ghash_mul_noreduce $H_POW1, $AESDATA0, $LO, $MI, $HI,
+ $GHASHDATA3, $AESDATA1, $AESDATA2, $AESDATA3 ]}
vpxor $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
add \$$VL, $POWERS_PTR
@@ -1217,21 +1308,21 @@
# Encrypt one last vector of counter blocks. This does not need to be
# masked. The counter does not need to be incremented here.
- vpshufb $BSWAP_MASK, $LE_CTR, $V0
- vpxord $RNDKEY0, $V0, $V0
+ vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA0
+ vpxord $RNDKEY0, $AESDATA0, $AESDATA0
lea 16($AESKEY), %rax
.Lvaesenc_loop_tail_partialvec$local_label_suffix:
vbroadcasti32x4 (%rax), $RNDKEY
- vaesenc $RNDKEY, $V0, $V0
+ vaesenc $RNDKEY, $AESDATA0, $AESDATA0
add \$16, %rax
cmp %rax, $RNDKEYLAST_PTR
jne .Lvaesenc_loop_tail_partialvec$local_label_suffix
- vaesenclast $RNDKEYLAST, $V0, $V0
+ vaesenclast $RNDKEYLAST, $AESDATA0, $AESDATA0
# XOR the data with the appropriate number of keystream bytes.
- vmovdqu8 ($SRC), $V1\{%k1}{z}
- vpxord $V1, $V0, $V0
- vmovdqu8 $V0, ($DST){%k1}
+ vmovdqu8 ($SRC), $AESDATA1\{%k1}{z}
+ vpxord $AESDATA1, $AESDATA0, $AESDATA0
+ vmovdqu8 $AESDATA0, ($DST){%k1}
# Update GHASH with the ciphertext block(s), without reducing.
#
@@ -1246,17 +1337,17 @@
# they're multiplied with are also all-zeroes. Therefore they just add
# 0 * 0 = 0 to the final GHASH result, which makes no difference.
vmovdqu8 ($POWERS_PTR), $H_POW1\{%k2}{z}
- @{[ $enc ? "vmovdqu8 $V0, $V1\{%k1}{z}" : "" ]}
- vpshufb $BSWAP_MASK, $V1, $V0
- vpxord $GHASH_ACC, $V0, $V0
- @{[ _ghash_mul_noreduce $H_POW1, $V0, $LO, $MI, $HI, $GHASHDATA3,
- $V1, $V2, $V3 ]}
+ @{[ $enc ? "vmovdqu8 $AESDATA0, $AESDATA1\{%k1}{z}" : "" ]}
+ vpshufb $BSWAP_MASK, $AESDATA1, $AESDATA0
+ vpxord $GHASH_ACC, $AESDATA0, $AESDATA0
+ @{[ _ghash_mul_noreduce $H_POW1, $AESDATA0, $LO, $MI, $HI,
+ $GHASHDATA3, $AESDATA1, $AESDATA2, $AESDATA3 ]}
.Lreduce$local_label_suffix:
# Finally, do the GHASH reduction.
- @{[ _ghash_reduce $LO, $MI, $HI, $GFPOLY, $V0 ]}
+ @{[ _ghash_reduce $LO, $MI, $HI, $GFPOLY, $AESDATA0 ]}
@{[ _horizontal_xor $HI, $HI_XMM, $GHASH_ACC_XMM,
- "%xmm0", "%xmm1", "%xmm2" ]}
+ $AESDATA0_XMM, $AESDATA1_XMM, $AESDATA2_XMM ]}
.Ldone$local_label_suffix:
# Store the updated GHASH accumulator back to memory.
@@ -1271,7 +1362,7 @@
# void gcm_gmult_vpclmulqdq_avx10(uint8_t Xi[16], const u128 Htable[16]);
$code .= _begin_func "gcm_gmult_vpclmulqdq_avx10", 1;
{
- my ( $GHASH_ACC_PTR, $H_POWERS ) = @argregs[ 0 .. 1 ];
+ my ( $GHASH_ACC_PTR, $HTABLE ) = @argregs[ 0 .. 1 ];
my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
map( "%xmm$_", ( 0 .. 6 ) );
@@ -1281,7 +1372,7 @@
vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC
vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK
- vmovdqu $OFFSETOFEND_H_POWERS-16($H_POWERS), $H_POW1
+ vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1
vmovdqu .Lgfpoly(%rip), $GFPOLY
vpshufb $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
diff --git a/gen/bcm/aes-gcm-avx10-x86_64-apple.S b/gen/bcm/aes-gcm-avx10-x86_64-apple.S
index be66605..a7ec87e 100644
--- a/gen/bcm/aes-gcm-avx10-x86_64-apple.S
+++ b/gen/bcm/aes-gcm-avx10-x86_64-apple.S
@@ -428,22 +428,14 @@
addq $16,%rax
cmpq %rax,%r11
jne L$vaesenc_loop_first_4_vecs__func1
-
-
-
vpxord 0(%rdi),%zmm14,%zmm4
vpxord 64(%rdi),%zmm14,%zmm5
vpxord 128(%rdi),%zmm14,%zmm6
vpxord 192(%rdi),%zmm14,%zmm7
-
-
-
vaesenclast %zmm4,%zmm0,%zmm4
vaesenclast %zmm5,%zmm1,%zmm5
vaesenclast %zmm6,%zmm2,%zmm6
vaesenclast %zmm7,%zmm3,%zmm7
-
-
vmovdqu8 %zmm4,0(%rsi)
vmovdqu8 %zmm5,64(%rsi)
vmovdqu8 %zmm6,128(%rsi)
@@ -454,6 +446,7 @@
addq $-256,%rdx
cmpq $256-1,%rdx
jbe L$ghash_last_ciphertext_4x__func1
+
vbroadcasti32x4 -144(%r11),%zmm15
vbroadcasti32x4 -128(%r11),%zmm16
vbroadcasti32x4 -112(%r11),%zmm17
@@ -463,6 +456,7 @@
vbroadcasti32x4 -48(%r11),%zmm21
vbroadcasti32x4 -32(%r11),%zmm22
vbroadcasti32x4 -16(%r11),%zmm23
+
L$crypt_loop_4x__func1:
@@ -516,6 +510,9 @@
prefetcht0 512+64(%rdi)
prefetcht0 512+128(%rdi)
prefetcht0 512+192(%rdi)
+
+
+
vpshufb %zmm8,%zmm4,%zmm4
vpxord %zmm10,%zmm4,%zmm4
vpshufb %zmm8,%zmm5,%zmm5
@@ -605,28 +602,21 @@
vaesenc %zmm23,%zmm2,%zmm2
vaesenc %zmm23,%zmm3,%zmm3
+
vextracti32x4 $1,%zmm10,%xmm4
vextracti32x4 $2,%zmm10,%xmm5
vextracti32x4 $3,%zmm10,%xmm6
vpxord %xmm4,%xmm10,%xmm10
vpternlogd $0x96,%xmm5,%xmm6,%xmm10
-
-
-
vpxord 0(%rdi),%zmm14,%zmm4
vpxord 64(%rdi),%zmm14,%zmm5
vpxord 128(%rdi),%zmm14,%zmm6
vpxord 192(%rdi),%zmm14,%zmm7
-
-
-
vaesenclast %zmm4,%zmm0,%zmm4
vaesenclast %zmm5,%zmm1,%zmm5
vaesenclast %zmm6,%zmm2,%zmm6
vaesenclast %zmm7,%zmm3,%zmm7
-
-
vmovdqu8 %zmm4,0(%rsi)
vmovdqu8 %zmm5,64(%rsi)
vmovdqu8 %zmm6,128(%rsi)
@@ -895,6 +885,7 @@
vmovdqu8 256-192(%r9),%zmm28
vmovdqu8 256-128(%r9),%zmm29
vmovdqu8 256-64(%r9),%zmm30
+
vbroadcasti32x4 -144(%r11),%zmm15
vbroadcasti32x4 -128(%r11),%zmm16
vbroadcasti32x4 -112(%r11),%zmm17
@@ -904,6 +895,7 @@
vbroadcasti32x4 -48(%r11),%zmm21
vbroadcasti32x4 -32(%r11),%zmm22
vbroadcasti32x4 -16(%r11),%zmm23
+
L$crypt_loop_4x__func2:
vmovdqu8 0(%rdi),%zmm4
vmovdqu8 64(%rdi),%zmm5
@@ -961,6 +953,9 @@
prefetcht0 512+64(%rdi)
prefetcht0 512+128(%rdi)
prefetcht0 512+192(%rdi)
+
+
+
vpshufb %zmm8,%zmm4,%zmm4
vpxord %zmm10,%zmm4,%zmm4
vpshufb %zmm8,%zmm5,%zmm5
@@ -1050,28 +1045,21 @@
vaesenc %zmm23,%zmm2,%zmm2
vaesenc %zmm23,%zmm3,%zmm3
+
vextracti32x4 $1,%zmm10,%xmm4
vextracti32x4 $2,%zmm10,%xmm5
vextracti32x4 $3,%zmm10,%xmm6
vpxord %xmm4,%xmm10,%xmm10
vpternlogd $0x96,%xmm5,%xmm6,%xmm10
-
-
-
vpxord 0(%rdi),%zmm14,%zmm4
vpxord 64(%rdi),%zmm14,%zmm5
vpxord 128(%rdi),%zmm14,%zmm6
vpxord 192(%rdi),%zmm14,%zmm7
-
-
-
vaesenclast %zmm4,%zmm0,%zmm4
vaesenclast %zmm5,%zmm1,%zmm5
vaesenclast %zmm6,%zmm2,%zmm6
vaesenclast %zmm7,%zmm3,%zmm7
-
-
vmovdqu8 %zmm4,0(%rsi)
vmovdqu8 %zmm5,64(%rsi)
vmovdqu8 %zmm6,128(%rsi)
diff --git a/gen/bcm/aes-gcm-avx10-x86_64-linux.S b/gen/bcm/aes-gcm-avx10-x86_64-linux.S
index b525623..0ffc7c7 100644
--- a/gen/bcm/aes-gcm-avx10-x86_64-linux.S
+++ b/gen/bcm/aes-gcm-avx10-x86_64-linux.S
@@ -430,22 +430,14 @@
addq $16,%rax
cmpq %rax,%r11
jne .Lvaesenc_loop_first_4_vecs__func1
-
-
-
vpxord 0(%rdi),%zmm14,%zmm4
vpxord 64(%rdi),%zmm14,%zmm5
vpxord 128(%rdi),%zmm14,%zmm6
vpxord 192(%rdi),%zmm14,%zmm7
-
-
-
vaesenclast %zmm4,%zmm0,%zmm4
vaesenclast %zmm5,%zmm1,%zmm5
vaesenclast %zmm6,%zmm2,%zmm6
vaesenclast %zmm7,%zmm3,%zmm7
-
-
vmovdqu8 %zmm4,0(%rsi)
vmovdqu8 %zmm5,64(%rsi)
vmovdqu8 %zmm6,128(%rsi)
@@ -456,6 +448,7 @@
addq $-256,%rdx
cmpq $256-1,%rdx
jbe .Lghash_last_ciphertext_4x__func1
+
vbroadcasti32x4 -144(%r11),%zmm15
vbroadcasti32x4 -128(%r11),%zmm16
vbroadcasti32x4 -112(%r11),%zmm17
@@ -465,6 +458,7 @@
vbroadcasti32x4 -48(%r11),%zmm21
vbroadcasti32x4 -32(%r11),%zmm22
vbroadcasti32x4 -16(%r11),%zmm23
+
.Lcrypt_loop_4x__func1:
@@ -518,6 +512,9 @@
prefetcht0 512+64(%rdi)
prefetcht0 512+128(%rdi)
prefetcht0 512+192(%rdi)
+
+
+
vpshufb %zmm8,%zmm4,%zmm4
vpxord %zmm10,%zmm4,%zmm4
vpshufb %zmm8,%zmm5,%zmm5
@@ -607,28 +604,21 @@
vaesenc %zmm23,%zmm2,%zmm2
vaesenc %zmm23,%zmm3,%zmm3
+
vextracti32x4 $1,%zmm10,%xmm4
vextracti32x4 $2,%zmm10,%xmm5
vextracti32x4 $3,%zmm10,%xmm6
vpxord %xmm4,%xmm10,%xmm10
vpternlogd $0x96,%xmm5,%xmm6,%xmm10
-
-
-
vpxord 0(%rdi),%zmm14,%zmm4
vpxord 64(%rdi),%zmm14,%zmm5
vpxord 128(%rdi),%zmm14,%zmm6
vpxord 192(%rdi),%zmm14,%zmm7
-
-
-
vaesenclast %zmm4,%zmm0,%zmm4
vaesenclast %zmm5,%zmm1,%zmm5
vaesenclast %zmm6,%zmm2,%zmm6
vaesenclast %zmm7,%zmm3,%zmm7
-
-
vmovdqu8 %zmm4,0(%rsi)
vmovdqu8 %zmm5,64(%rsi)
vmovdqu8 %zmm6,128(%rsi)
@@ -899,6 +889,7 @@
vmovdqu8 256-192(%r9),%zmm28
vmovdqu8 256-128(%r9),%zmm29
vmovdqu8 256-64(%r9),%zmm30
+
vbroadcasti32x4 -144(%r11),%zmm15
vbroadcasti32x4 -128(%r11),%zmm16
vbroadcasti32x4 -112(%r11),%zmm17
@@ -908,6 +899,7 @@
vbroadcasti32x4 -48(%r11),%zmm21
vbroadcasti32x4 -32(%r11),%zmm22
vbroadcasti32x4 -16(%r11),%zmm23
+
.Lcrypt_loop_4x__func2:
vmovdqu8 0(%rdi),%zmm4
vmovdqu8 64(%rdi),%zmm5
@@ -965,6 +957,9 @@
prefetcht0 512+64(%rdi)
prefetcht0 512+128(%rdi)
prefetcht0 512+192(%rdi)
+
+
+
vpshufb %zmm8,%zmm4,%zmm4
vpxord %zmm10,%zmm4,%zmm4
vpshufb %zmm8,%zmm5,%zmm5
@@ -1054,28 +1049,21 @@
vaesenc %zmm23,%zmm2,%zmm2
vaesenc %zmm23,%zmm3,%zmm3
+
vextracti32x4 $1,%zmm10,%xmm4
vextracti32x4 $2,%zmm10,%xmm5
vextracti32x4 $3,%zmm10,%xmm6
vpxord %xmm4,%xmm10,%xmm10
vpternlogd $0x96,%xmm5,%xmm6,%xmm10
-
-
-
vpxord 0(%rdi),%zmm14,%zmm4
vpxord 64(%rdi),%zmm14,%zmm5
vpxord 128(%rdi),%zmm14,%zmm6
vpxord 192(%rdi),%zmm14,%zmm7
-
-
-
vaesenclast %zmm4,%zmm0,%zmm4
vaesenclast %zmm5,%zmm1,%zmm5
vaesenclast %zmm6,%zmm2,%zmm6
vaesenclast %zmm7,%zmm3,%zmm7
-
-
vmovdqu8 %zmm4,0(%rsi)
vmovdqu8 %zmm5,64(%rsi)
vmovdqu8 %zmm6,128(%rsi)
diff --git a/gen/bcm/aes-gcm-avx10-x86_64-win.asm b/gen/bcm/aes-gcm-avx10-x86_64-win.asm
index 733ae72..051a530 100644
--- a/gen/bcm/aes-gcm-avx10-x86_64-win.asm
+++ b/gen/bcm/aes-gcm-avx10-x86_64-win.asm
@@ -495,22 +495,14 @@
add rax,16
cmp r11,rax
jne NEAR $L$vaesenc_loop_first_4_vecs__func1
-
-
-
vpxord zmm4,zmm14,ZMMWORD[rcx]
vpxord zmm5,zmm14,ZMMWORD[64+rcx]
vpxord zmm6,zmm14,ZMMWORD[128+rcx]
vpxord zmm7,zmm14,ZMMWORD[192+rcx]
-
-
-
vaesenclast zmm4,zmm0,zmm4
vaesenclast zmm5,zmm1,zmm5
vaesenclast zmm6,zmm2,zmm6
vaesenclast zmm7,zmm3,zmm7
-
-
vmovdqu8 ZMMWORD[rdx],zmm4
vmovdqu8 ZMMWORD[64+rdx],zmm5
vmovdqu8 ZMMWORD[128+rdx],zmm6
@@ -521,6 +513,7 @@
add r8,-4*64
cmp r8,4*64-1
jbe NEAR $L$ghash_last_ciphertext_4x__func1
+
vbroadcasti32x4 zmm15,ZMMWORD[((-144))+r11]
vbroadcasti32x4 zmm16,ZMMWORD[((-128))+r11]
vbroadcasti32x4 zmm17,ZMMWORD[((-112))+r11]
@@ -530,6 +523,7 @@
vbroadcasti32x4 zmm21,ZMMWORD[((-48))+r11]
vbroadcasti32x4 zmm22,ZMMWORD[((-32))+r11]
vbroadcasti32x4 zmm23,ZMMWORD[((-16))+r11]
+
$L$crypt_loop_4x__func1:
@@ -583,6 +577,9 @@
prefetcht0 [((512+64))+rcx]
prefetcht0 [((512+128))+rcx]
prefetcht0 [((512+192))+rcx]
+
+
+
vpshufb zmm4,zmm4,zmm8
vpxord zmm4,zmm4,zmm10
vpshufb zmm5,zmm5,zmm8
@@ -672,28 +669,21 @@
vaesenc zmm2,zmm2,zmm23
vaesenc zmm3,zmm3,zmm23
+
vextracti32x4 xmm4,zmm10,1
vextracti32x4 xmm5,zmm10,2
vextracti32x4 xmm6,zmm10,3
vpxord xmm10,xmm10,xmm4
vpternlogd xmm10,xmm6,xmm5,0x96
-
-
-
vpxord zmm4,zmm14,ZMMWORD[rcx]
vpxord zmm5,zmm14,ZMMWORD[64+rcx]
vpxord zmm6,zmm14,ZMMWORD[128+rcx]
vpxord zmm7,zmm14,ZMMWORD[192+rcx]
-
-
-
vaesenclast zmm4,zmm0,zmm4
vaesenclast zmm5,zmm1,zmm5
vaesenclast zmm6,zmm2,zmm6
vaesenclast zmm7,zmm3,zmm7
-
-
vmovdqu8 ZMMWORD[rdx],zmm4
vmovdqu8 ZMMWORD[64+rdx],zmm5
vmovdqu8 ZMMWORD[128+rdx],zmm6
@@ -1003,6 +993,7 @@
vmovdqu8 zmm28,ZMMWORD[((256-192))+rdi]
vmovdqu8 zmm29,ZMMWORD[((256-128))+rdi]
vmovdqu8 zmm30,ZMMWORD[((256-64))+rdi]
+
vbroadcasti32x4 zmm15,ZMMWORD[((-144))+r11]
vbroadcasti32x4 zmm16,ZMMWORD[((-128))+r11]
vbroadcasti32x4 zmm17,ZMMWORD[((-112))+r11]
@@ -1012,6 +1003,7 @@
vbroadcasti32x4 zmm21,ZMMWORD[((-48))+r11]
vbroadcasti32x4 zmm22,ZMMWORD[((-32))+r11]
vbroadcasti32x4 zmm23,ZMMWORD[((-16))+r11]
+
$L$crypt_loop_4x__func2:
vmovdqu8 zmm4,ZMMWORD[rcx]
vmovdqu8 zmm5,ZMMWORD[64+rcx]
@@ -1069,6 +1061,9 @@
prefetcht0 [((512+64))+rcx]
prefetcht0 [((512+128))+rcx]
prefetcht0 [((512+192))+rcx]
+
+
+
vpshufb zmm4,zmm4,zmm8
vpxord zmm4,zmm4,zmm10
vpshufb zmm5,zmm5,zmm8
@@ -1158,28 +1153,21 @@
vaesenc zmm2,zmm2,zmm23
vaesenc zmm3,zmm3,zmm23
+
vextracti32x4 xmm4,zmm10,1
vextracti32x4 xmm5,zmm10,2
vextracti32x4 xmm6,zmm10,3
vpxord xmm10,xmm10,xmm4
vpternlogd xmm10,xmm6,xmm5,0x96
-
-
-
vpxord zmm4,zmm14,ZMMWORD[rcx]
vpxord zmm5,zmm14,ZMMWORD[64+rcx]
vpxord zmm6,zmm14,ZMMWORD[128+rcx]
vpxord zmm7,zmm14,ZMMWORD[192+rcx]
-
-
-
vaesenclast zmm4,zmm0,zmm4
vaesenclast zmm5,zmm1,zmm5
vaesenclast zmm6,zmm2,zmm6
vaesenclast zmm7,zmm3,zmm7
-
-
vmovdqu8 ZMMWORD[rdx],zmm4
vmovdqu8 ZMMWORD[64+rdx],zmm5
vmovdqu8 ZMMWORD[128+rdx],zmm6