aes-gcm-avx10-x86_64.pl: fold _ghash_mul_step into _ghash_mul

Fold _ghash_mul_step into _ghash_mul, since the support for interleaving
a single-vector GHASH multiplication with other instructions is not used
in the BoringSSL port.  (It is still used in the Linux port, for
_aes_gcm_final.)

No change to the generated assembly code.

Change-Id: I5a65ee490e814ca6390c4b968ab135091c344a98
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/77167
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl b/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl
index 8a099d3..36aef35 100644
--- a/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl
@@ -256,13 +256,9 @@
     }
 }
 
-# The _ghash_mul_step macro does one step of GHASH multiplication of the
-# 128-bit lanes of \a by the corresponding 128-bit lanes of \b and storing the
-# reduced products in \dst.  \t0, \t1, and \t2 are temporary registers of the
-# same size as \a and \b.  To complete all steps, this must invoked with \i=0
-# through \i=9.  The division into steps allows users of this macro to
-# optionally interleave the computation with other instructions.  Users of this
-# macro must preserve the parameter registers across steps.
+# The _ghash_mul macro multiplies the 128-bit lanes of \a by the corresponding
+# 128-bit lanes of \b and stores the reduced products in \dst.  \t0, \t1, and
+# \t2 are temporary registers of the same size as \a and \b.
 #
 # The multiplications are done in GHASH's representation of the finite field
 # GF(2^128).  Elements of GF(2^128) are represented as binary polynomials
@@ -359,52 +355,21 @@
 #
 # Using Karatsuba multiplication instead of "schoolbook" multiplication
 # similarly would save a vpclmulqdq but does not seem to be worth it.
-sub _ghash_mul_step {
-    my ( $i, $a, $b, $dst, $gfpoly, $t0, $t1, $t2 ) = @_;
-    if ( $i == 0 ) {
-        return "vpclmulqdq \$0x00, $a, $b, $t0\n" .    # LO = a_L * b_L
-          "vpclmulqdq \$0x01, $a, $b, $t1\n";          # MI_0 = a_L * b_H
-    }
-    elsif ( $i == 1 ) {
-        return "vpclmulqdq \$0x10, $a, $b, $t2\n";     # MI_1 = a_H * b_L
-    }
-    elsif ( $i == 2 ) {
-        return "vpxord $t2, $t1, $t1\n";               # MI = MI_0 + MI_1
-    }
-    elsif ( $i == 3 ) {
-        return
-          "vpclmulqdq \$0x01, $t0, $gfpoly, $t2\n";  # LO_L*(x^63 + x^62 + x^57)
-    }
-    elsif ( $i == 4 ) {
-        return "vpshufd \$0x4e, $t0, $t0\n";         # Swap halves of LO
-    }
-    elsif ( $i == 5 ) {
-        return "vpternlogd \$0x96, $t2, $t0, $t1\n";    # Fold LO into MI
-    }
-    elsif ( $i == 6 ) {
-        return "vpclmulqdq \$0x11, $a, $b, $dst\n";     # HI = a_H * b_H
-    }
-    elsif ( $i == 7 ) {
-        return
-          "vpclmulqdq \$0x01, $t1, $gfpoly, $t0\n";  # MI_L*(x^63 + x^62 + x^57)
-    }
-    elsif ( $i == 8 ) {
-        return "vpshufd \$0x4e, $t1, $t1\n";         # Swap halves of MI
-    }
-    elsif ( $i == 9 ) {
-        return "vpternlogd \$0x96, $t0, $t1, $dst\n";    # Fold MI into HI
-    }
-}
-
-# GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
-# the reduced products in \dst.  See _ghash_mul_step for full explanation.
 sub _ghash_mul {
     my ( $a, $b, $dst, $gfpoly, $t0, $t1, $t2 ) = @_;
-    my $code = "";
-    for my $i ( 0 .. 9 ) {
-        $code .= _ghash_mul_step $i, $a, $b, $dst, $gfpoly, $t0, $t1, $t2;
-    }
-    return $code;
+    return <<___;
+    vpclmulqdq      \$0x00, $a, $b, $t0        # LO = a_L * b_L
+    vpclmulqdq      \$0x01, $a, $b, $t1        # MI_0 = a_L * b_H
+    vpclmulqdq      \$0x10, $a, $b, $t2        # MI_1 = a_H * b_L
+    vpxord          $t2, $t1, $t1              # MI = MI_0 + MI_1
+    vpclmulqdq      \$0x01, $t0, $gfpoly, $t2  # LO_L*(x^63 + x^62 + x^57)
+    vpshufd         \$0x4e, $t0, $t0           # Swap halves of LO
+    vpternlogd      \$0x96, $t2, $t0, $t1      # Fold LO into MI
+    vpclmulqdq      \$0x11, $a, $b, $dst       # HI = a_H * b_H
+    vpclmulqdq      \$0x01, $t1, $gfpoly, $t0  # MI_L*(x^63 + x^62 + x^57)
+    vpshufd         \$0x4e, $t1, $t1           # Swap halves of MI
+    vpternlogd      \$0x96, $t0, $t1, $dst     # Fold MI into HI
+___
 }
 
 # GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the
@@ -423,7 +388,7 @@
 }
 
 # Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit
-# reduced products in \hi.  See _ghash_mul_step for explanation of reduction.
+# reduced products in \hi.  See _ghash_mul for explanation of reduction.
 sub _ghash_reduce {
     my ( $lo, $mi, $hi, $gfpoly, $t0 ) = @_;
     return <<___;
@@ -477,7 +442,7 @@
     # interpretation of polynomial coefficients), which can also be
     # interpreted as multiplication by x mod x^128 + x^127 + x^126 + x^121
     # + 1 using the alternative, natural interpretation of polynomial
-    # coefficients.  For details, see the comment above _ghash_mul_step.
+    # coefficients.  For details, see the comment above _ghash_mul.
     #
     # Either way, for the multiplication the concrete operation performed
     # is a left shift of the 128-bit value by 1 bit, then an XOR with (0xc2
@@ -596,8 +561,8 @@
 #     128-bits each.  This leaves VL/16 128-bit intermediate values.
 #   - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM.
 #
-# See _ghash_mul_step for the full explanation of the operations performed for
-# each individual finite field multiplication and reduction.
+# See _ghash_mul for the full explanation of the operations performed for each
+# individual finite field multiplication and reduction.
 sub _ghash_step_4x {
     my ($i) = @_;
     if ( $i == 0 ) {