| #!/usr/bin/env perl |
| # Copyright 2024 The BoringSSL Authors |
| # |
| # Permission to use, copy, modify, and/or distribute this software for any |
| # purpose with or without fee is hereby granted, provided that the above |
| # copyright notice and this permission notice appear in all copies. |
| # |
| # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
| # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
| # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
| # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION |
| # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
| # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
| # |
| #------------------------------------------------------------------------------ |
| # |
| # VAES and VPCLMULQDQ optimized AES-GCM for x86_64 (AVX2 version) |
| # |
| # This is similar to aes-gcm-avx10-x86_64.pl, but it uses AVX2 instead of AVX512 |
| # / AVX10. This means it can only use 16 vector registers instead of 32, the |
| # maximum vector length is 32 bytes, and some instructions such as vpternlogd |
| # and masked loads/stores are unavailable. However, it is able to run on CPUs |
| # that have VAES without AVX512 / AVX10, namely AMD Zen 3 (including "Milan" |
| # server processors) and some Intel client CPUs such as Alder Lake. |
| # |
| # This implementation also uses Karatsuba multiplication instead of schoolbook |
| # multiplication for GHASH in its main loop. This does not help much on Intel, |
# but it improves performance by ~5% on AMD Zen 3, which is the main target for
| # this implementation. Other factors weighing slightly in favor of Karatsuba |
| # multiplication in this implementation are the lower maximum vector length |
| # (which means there is space left in the Htable array to cache the halves of |
| # the key powers XOR'd together) and the unavailability of the vpternlogd |
| # instruction (which helped schoolbook a bit more than Karatsuba). |
| |
| use strict; |
| |
| my $flavour = shift; |
| my $output = shift; |
| if ( $flavour =~ /\./ ) { $output = $flavour; undef $flavour; } |
| |
| my $win64; |
| my @argregs; |
| if ( $flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/ ) { |
| $win64 = 1; |
| @argregs = ( "%rcx", "%rdx", "%r8", "%r9" ); |
| } |
| else { |
| $win64 = 0; |
| @argregs = ( "%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9" ); |
| } |
| |
| $0 =~ m/(.*[\/\\])[^\/\\]+$/; |
| my $dir = $1; |
| my $xlate; |
| ( $xlate = "${dir}x86_64-xlate.pl" and -f $xlate ) |
| or ( $xlate = "${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate ) |
| or die "can't locate x86_64-xlate.pl"; |
| |
open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
| *STDOUT = *OUT; |
| |
| my $g_cur_func_name; |
| my $g_cur_func_uses_seh; |
| my @g_cur_func_saved_gpregs; |
| my @g_cur_func_saved_xmmregs; |
| |
| sub _begin_func { |
| my ( $funcname, $uses_seh ) = @_; |
| $g_cur_func_name = $funcname; |
| $g_cur_func_uses_seh = $uses_seh; |
| @g_cur_func_saved_gpregs = (); |
| @g_cur_func_saved_xmmregs = (); |
| return <<___; |
| .globl $funcname |
| .type $funcname,\@abi-omnipotent |
| .align 32 |
| $funcname: |
| .cfi_startproc |
| @{[ $uses_seh ? ".seh_startproc" : "" ]} |
| _CET_ENDBR |
| ___ |
| } |
| |
| # Push a list of general purpose registers onto the stack. |
| sub _save_gpregs { |
| my @gpregs = @_; |
| my $code = ""; |
| die "_save_gpregs requires uses_seh" unless $g_cur_func_uses_seh; |
| die "_save_gpregs can only be called once per function" |
| if @g_cur_func_saved_gpregs; |
| die "Order must be _save_gpregs, then _save_xmmregs" |
| if @g_cur_func_saved_xmmregs; |
| @g_cur_func_saved_gpregs = @gpregs; |
| for my $reg (@gpregs) { |
| $code .= "push $reg\n"; |
| if ($win64) { |
| $code .= ".seh_pushreg $reg\n"; |
| } |
| else { |
| $code .= ".cfi_push $reg\n"; |
| } |
| } |
| return $code; |
| } |
| |
| # Push a list of xmm registers onto the stack if the target is Windows. |
| sub _save_xmmregs { |
| my @xmmregs = @_; |
| my $num_xmmregs = scalar @xmmregs; |
| my $code = ""; |
| die "_save_xmmregs requires uses_seh" unless $g_cur_func_uses_seh; |
| die "_save_xmmregs can only be called once per function" |
| if @g_cur_func_saved_xmmregs; |
| if ( $win64 and $num_xmmregs > 0 ) { |
| @g_cur_func_saved_xmmregs = @xmmregs; |
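# At function entry, %rsp is 8 bytes past 16-byte alignment (the return
# address), and each pushed GP register toggles that. So if an even number of
# GP registers were saved, an extra 8 bytes are needed to keep the movdqa
# save area 16-byte aligned.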
| my $is_misaligned = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0; |
| my $alloc_size = 16 * $num_xmmregs + ( $is_misaligned ? 8 : 0 ); |
| $code .= "sub \$$alloc_size, %rsp\n"; |
| $code .= ".seh_stackalloc $alloc_size\n"; |
| for my $i ( 0 .. $num_xmmregs - 1 ) { |
| my $reg_num = $xmmregs[$i]; |
| my $pos = 16 * $i; |
| $code .= "movdqa %xmm$reg_num, $pos(%rsp)\n"; |
| $code .= ".seh_savexmm %xmm$reg_num, $pos\n"; |
| } |
| } |
| return $code; |
| } |
| |
| sub _end_func { |
| my $code = ""; |
| |
| # Restore any xmm registers that were saved earlier. |
| my $num_xmmregs = scalar @g_cur_func_saved_xmmregs; |
| if ( $win64 and $num_xmmregs > 0 ) { |
| my $need_alignment = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0; |
| my $alloc_size = 16 * $num_xmmregs + ( $need_alignment ? 8 : 0 ); |
| for my $i ( 0 .. $num_xmmregs - 1 ) { |
| my $reg_num = $g_cur_func_saved_xmmregs[$i]; |
| my $pos = 16 * $i; |
| $code .= "movdqa $pos(%rsp), %xmm$reg_num\n"; |
| } |
| $code .= "add \$$alloc_size, %rsp\n"; |
| } |
| |
| # Restore any general purpose registers that were saved earlier. |
| for my $reg ( reverse @g_cur_func_saved_gpregs ) { |
| $code .= "pop $reg\n"; |
| if ( !$win64 ) { |
| $code .= ".cfi_pop $reg\n"; |
| } |
| } |
| |
| $code .= <<___; |
| ret |
| @{[ $g_cur_func_uses_seh ? ".seh_endproc" : "" ]} |
| .cfi_endproc |
| .size $g_cur_func_name, . - $g_cur_func_name |
| ___ |
| return $code; |
| } |
| |
| my $code = <<___; |
| .section .rodata |
| .align 16 |
| |
| # A shuffle mask that reflects the bytes of 16-byte blocks |
| .Lbswap_mask: |
| .quad 0x08090a0b0c0d0e0f, 0x0001020304050607 |
| |
| # This is the GHASH reducing polynomial without its constant term, i.e. |
| # x^128 + x^7 + x^2 + x, represented using the backwards mapping |
| # between bits and polynomial coefficients. |
| # |
| # Alternatively, it can be interpreted as the naturally-ordered |
| # representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the |
| # "reversed" GHASH reducing polynomial without its x^128 term. |
| .Lgfpoly: |
| .quad 1, 0xc200000000000000 |
| |
| # Same as above, but with the (1 << 64) bit set. |
| .Lgfpoly_and_internal_carrybit: |
| .quad 1, 0xc200000000000001 |
| |
| .align 32 |
| # The below constants are used for incrementing the counter blocks. |
| .Lctr_pattern: |
| .quad 0, 0 |
| .quad 1, 0 |
| .Linc_2blocks: |
| .quad 2, 0 |
| .quad 2, 0 |
| |
| .text |
| ___ |
| |
| # We use Htable[0..7] to store H^8 through H^1, and Htable[8..11] to store the |
| # 64-bit halves of the key powers XOR'd together (for Karatsuba multiplication) |
| # in the order 8,6,7,5,4,2,3,1. We do not use Htable[12..15]. |
| my $NUM_H_POWERS = 8; |
| my $OFFSETOFEND_H_POWERS = $NUM_H_POWERS * 16; |
| my $OFFSETOF_H_POWERS_XORED = $OFFSETOFEND_H_POWERS; |
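# In byte terms: the key powers occupy Htable bytes 0..127, the XOR'd halves
# occupy bytes 128..191, and the last 64 bytes of the 256-byte table are
# unused.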
| |
| # Offset to 'rounds' in AES_KEY struct |
| my $OFFSETOF_AES_ROUNDS = 240; |
| |
# GHASH-multiply the 128-bit lanes of $a by the 128-bit lanes of $b and store
# the reduced products in $dst. Uses schoolbook multiplication.
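#
# The three 128-bit pieces LO, MI, and HI hold the unreduced 256-bit product at
# bit offsets 0, 64, and 128. The two "fold" steps below then fold LO into MI
# and MI into HI using the x^63 + x^62 + x^57 constant in $gfpoly, so that
# $dst ends up holding the fully reduced 128-bit product.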
| sub _ghash_mul { |
| my ( $a, $b, $dst, $gfpoly, $t0, $t1, $t2 ) = @_; |
| return <<___; |
| vpclmulqdq \$0x00, $a, $b, $t0 # LO = a_L * b_L |
| vpclmulqdq \$0x01, $a, $b, $t1 # MI_0 = a_L * b_H |
| vpclmulqdq \$0x10, $a, $b, $t2 # MI_1 = a_H * b_L |
| vpxor $t2, $t1, $t1 # MI = MI_0 + MI_1 |
| vpclmulqdq \$0x01, $t0, $gfpoly, $t2 # LO_L*(x^63 + x^62 + x^57) |
| vpshufd \$0x4e, $t0, $t0 # Swap halves of LO |
| vpxor $t0, $t1, $t1 # Fold LO into MI (part 1) |
| vpxor $t2, $t1, $t1 # Fold LO into MI (part 2) |
| vpclmulqdq \$0x11, $a, $b, $dst # HI = a_H * b_H |
| vpclmulqdq \$0x01, $t1, $gfpoly, $t0 # MI_L*(x^63 + x^62 + x^57) |
| vpshufd \$0x4e, $t1, $t1 # Swap halves of MI |
| vpxor $t1, $dst, $dst # Fold MI into HI (part 1) |
| vpxor $t0, $dst, $dst # Fold MI into HI (part 2) |
| ___ |
| } |
| |
| # void gcm_init_vpclmulqdq_avx2(u128 Htable[16], const uint64_t H[2]); |
| # |
| # Initialize |Htable| with powers of the GHASH subkey |H|. |
| # |
| # We use Htable[0..7] to store H^8 through H^1, and Htable[8..11] to store the |
| # 64-bit halves of the key powers XOR'd together (for Karatsuba multiplication) |
| # in the order 8,6,7,5,4,2,3,1. We do not use Htable[12..15]. |
| $code .= _begin_func "gcm_init_vpclmulqdq_avx2", 1; |
| { |
| my ( $HTABLE, $H_PTR ) = @argregs[ 0 .. 1 ]; |
| my ( $TMP0, $TMP0_XMM ) = ( "%ymm0", "%xmm0" ); |
| my ( $TMP1, $TMP1_XMM ) = ( "%ymm1", "%xmm1" ); |
| my ( $TMP2, $TMP2_XMM ) = ( "%ymm2", "%xmm2" ); |
| my ( $H_CUR, $H_CUR_XMM ) = ( "%ymm3", "%xmm3" ); |
| my ( $H_CUR2, $H_CUR2_XMM ) = ( "%ymm4", "%xmm4" ); |
| my ( $H_INC, $H_INC_XMM ) = ( "%ymm5", "%xmm5" ); |
| my ( $GFPOLY, $GFPOLY_XMM ) = ( "%ymm6", "%xmm6" ); |
| |
| $code .= <<___; |
| @{[ _save_xmmregs (6) ]} |
| .seh_endprologue |
| |
| # Load the byte-reflected hash subkey. BoringSSL provides it in |
| # byte-reflected form except the two halves are in the wrong order. |
| vpshufd \$0x4e, ($H_PTR), $H_CUR_XMM |
| |
| # Finish preprocessing the byte-reflected hash subkey by multiplying it by |
| # x^-1 ("standard" interpretation of polynomial coefficients) or |
| # equivalently x^1 (natural interpretation). This gets the key into a |
| # format that avoids having to bit-reflect the data blocks later. |
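#
# Concretely: vpaddq doubles each 64-bit half of H, i.e. shifts it left by 1
# within each qword. The vpshufd+vpsrad pair builds a mask from the bits that
# are about to be shifted out, and the vpand/vpxor add them back in the right
# places: the bit shifted out of the low qword becomes the "internal
# carrybit" (bit 64), and the bit shifted out of the top is reduced by XORing
# in the .Lgfpoly terms.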
| vpshufd \$0xd3, $H_CUR_XMM, $TMP0_XMM |
| vpsrad \$31, $TMP0_XMM, $TMP0_XMM |
| vpaddq $H_CUR_XMM, $H_CUR_XMM, $H_CUR_XMM |
| vpand .Lgfpoly_and_internal_carrybit(%rip), $TMP0_XMM, $TMP0_XMM |
| vpxor $TMP0_XMM, $H_CUR_XMM, $H_CUR_XMM |
| |
| vbroadcasti128 .Lgfpoly(%rip), $GFPOLY |
| |
| # Square H^1 to get H^2. |
| @{[ _ghash_mul $H_CUR_XMM, $H_CUR_XMM, $H_INC_XMM, $GFPOLY_XMM, |
| $TMP0_XMM, $TMP1_XMM, $TMP2_XMM ]} |
| |
| # Create H_CUR = [H^2, H^1] and H_INC = [H^2, H^2]. |
| vinserti128 \$1, $H_CUR_XMM, $H_INC, $H_CUR |
| vinserti128 \$1, $H_INC_XMM, $H_INC, $H_INC |
| |
| # Compute H_CUR2 = [H^4, H^3]. |
| @{[ _ghash_mul $H_INC, $H_CUR, $H_CUR2, $GFPOLY, $TMP0, $TMP1, $TMP2 ]} |
| |
| # Store [H^2, H^1] and [H^4, H^3]. |
| vmovdqu $H_CUR, 3*32($HTABLE) |
| vmovdqu $H_CUR2, 2*32($HTABLE) |
| |
| # For Karatsuba multiplication: compute and store the two 64-bit halves of |
| # each key power XOR'd together. Order is 4,2,3,1. |
| vpunpcklqdq $H_CUR, $H_CUR2, $TMP0 |
| vpunpckhqdq $H_CUR, $H_CUR2, $TMP1 |
| vpxor $TMP1, $TMP0, $TMP0 |
| vmovdqu $TMP0, $OFFSETOF_H_POWERS_XORED+32($HTABLE) |
| |
| # Compute and store H_CUR = [H^6, H^5] and H_CUR2 = [H^8, H^7]. |
| @{[ _ghash_mul $H_INC, $H_CUR2, $H_CUR, $GFPOLY, $TMP0, $TMP1, $TMP2 ]} |
| @{[ _ghash_mul $H_INC, $H_CUR, $H_CUR2, $GFPOLY, $TMP0, $TMP1, $TMP2 ]} |
| vmovdqu $H_CUR, 1*32($HTABLE) |
| vmovdqu $H_CUR2, 0*32($HTABLE) |
| |
| # Again, compute and store the two 64-bit halves of each key power XOR'd |
| # together. Order is 8,6,7,5. |
| vpunpcklqdq $H_CUR, $H_CUR2, $TMP0 |
| vpunpckhqdq $H_CUR, $H_CUR2, $TMP1 |
| vpxor $TMP1, $TMP0, $TMP0 |
| vmovdqu $TMP0, $OFFSETOF_H_POWERS_XORED($HTABLE) |
| |
| vzeroupper |
| ___ |
| } |
| $code .= _end_func; |
| |
| # Do one step of the GHASH update of four vectors of data blocks. |
| # $i: the step to do, 0 through 9 |
| # $ghashdata_ptr: pointer to the data blocks (ciphertext or AAD) |
| # $htable: pointer to the Htable for the key |
| # $bswap_mask: mask for reflecting the bytes of blocks |
| # $h_pow[2-1]_xored: XOR'd key powers cached from Htable |
| # $tmp[0-2]: temporary registers. $tmp[1-2] must be preserved across steps. |
| # $lo, $mi: working state for this macro that must be preserved across steps |
| # $ghash_acc: the GHASH accumulator (input/output) |
| sub _ghash_step_4x { |
| my ( |
| $i, $ghashdata_ptr, $htable, $bswap_mask, |
| $h_pow2_xored, $h_pow1_xored, $tmp0, $tmp0_xmm, |
| $tmp1, $tmp2, $lo, $mi, |
| $ghash_acc, $ghash_acc_xmm |
| ) = @_; |
| my ( $hi, $hi_xmm ) = ( $ghash_acc, $ghash_acc_xmm ); # alias |
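# Aliasing hi to ghash_acc is safe: step 0 reads the old accumulator
# (folding it into the first data vector) before hi is first written, and
# step 9 writes the reduced result back into the accumulator.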
| if ( $i == 0 ) { |
| return <<___; |
| # First vector |
| vmovdqu 0*32($ghashdata_ptr), $tmp1 |
| vpshufb $bswap_mask, $tmp1, $tmp1 |
| vmovdqu 0*32($htable), $tmp2 |
| vpxor $ghash_acc, $tmp1, $tmp1 |
| vpclmulqdq \$0x00, $tmp2, $tmp1, $lo |
| vpclmulqdq \$0x11, $tmp2, $tmp1, $hi |
| vpunpckhqdq $tmp1, $tmp1, $tmp0 |
| vpxor $tmp1, $tmp0, $tmp0 |
| vpclmulqdq \$0x00, $h_pow2_xored, $tmp0, $mi |
| ___ |
| } |
| elsif ( $i == 1 ) { |
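# No GHASH work is emitted for this step; it is presumably left empty to
# balance the amount of GHASH work paired with each AES round in the main
# loop.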
| return <<___; |
| ___ |
| } |
| elsif ( $i == 2 ) { |
| return <<___; |
| # Second vector |
| vmovdqu 1*32($ghashdata_ptr), $tmp1 |
| vpshufb $bswap_mask, $tmp1, $tmp1 |
| vmovdqu 1*32($htable), $tmp2 |
| vpclmulqdq \$0x00, $tmp2, $tmp1, $tmp0 |
| vpxor $tmp0, $lo, $lo |
| vpclmulqdq \$0x11, $tmp2, $tmp1, $tmp0 |
| vpxor $tmp0, $hi, $hi |
| vpunpckhqdq $tmp1, $tmp1, $tmp0 |
| vpxor $tmp1, $tmp0, $tmp0 |
| vpclmulqdq \$0x10, $h_pow2_xored, $tmp0, $tmp0 |
| vpxor $tmp0, $mi, $mi |
| ___ |
| } |
| elsif ( $i == 3 ) { |
| return <<___; |
| # Third vector |
| vmovdqu 2*32($ghashdata_ptr), $tmp1 |
| vpshufb $bswap_mask, $tmp1, $tmp1 |
| vmovdqu 2*32($htable), $tmp2 |
| ___ |
| } |
| elsif ( $i == 4 ) { |
| return <<___; |
| vpclmulqdq \$0x00, $tmp2, $tmp1, $tmp0 |
| vpxor $tmp0, $lo, $lo |
| vpclmulqdq \$0x11, $tmp2, $tmp1, $tmp0 |
| vpxor $tmp0, $hi, $hi |
| ___ |
| } |
| elsif ( $i == 5 ) { |
| return <<___; |
| vpunpckhqdq $tmp1, $tmp1, $tmp0 |
| vpxor $tmp1, $tmp0, $tmp0 |
| vpclmulqdq \$0x00, $h_pow1_xored, $tmp0, $tmp0 |
| vpxor $tmp0, $mi, $mi |
| |
| # Fourth vector |
| vmovdqu 3*32($ghashdata_ptr), $tmp1 |
| vpshufb $bswap_mask, $tmp1, $tmp1 |
| ___ |
| } |
| elsif ( $i == 6 ) { |
| return <<___; |
| vmovdqu 3*32($htable), $tmp2 |
| vpclmulqdq \$0x00, $tmp2, $tmp1, $tmp0 |
| vpxor $tmp0, $lo, $lo |
| vpclmulqdq \$0x11, $tmp2, $tmp1, $tmp0 |
| vpxor $tmp0, $hi, $hi |
| vpunpckhqdq $tmp1, $tmp1, $tmp0 |
| vpxor $tmp1, $tmp0, $tmp0 |
| vpclmulqdq \$0x10, $h_pow1_xored, $tmp0, $tmp0 |
| vpxor $tmp0, $mi, $mi |
| ___ |
| } |
| elsif ( $i == 7 ) { |
| return <<___; |
| # Finalize 'mi' following Karatsuba multiplication. |
| vpxor $lo, $mi, $mi |
| vpxor $hi, $mi, $mi |
| |
| # Fold lo into mi. |
| vbroadcasti128 .Lgfpoly(%rip), $tmp2 |
| vpclmulqdq \$0x01, $lo, $tmp2, $tmp0 |
| vpshufd \$0x4e, $lo, $lo |
| vpxor $lo, $mi, $mi |
| vpxor $tmp0, $mi, $mi |
| ___ |
| } |
| elsif ( $i == 8 ) { |
| return <<___; |
| # Fold mi into hi. |
| vpclmulqdq \$0x01, $mi, $tmp2, $tmp0 |
| vpshufd \$0x4e, $mi, $mi |
| vpxor $mi, $hi, $hi |
| vpxor $tmp0, $hi, $hi |
| ___ |
| } |
| elsif ( $i == 9 ) { |
| return <<___; |
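# XOR the two 128-bit lanes together to produce the fully reduced GHASH
# accumulator.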
| vextracti128 \$1, $hi, $tmp0_xmm |
| vpxor $tmp0_xmm, $hi_xmm, $ghash_acc_xmm |
| ___ |
| } |
| } |
| |
| sub _ghash_4x { |
| my $code = ""; |
| for my $i ( 0 .. 9 ) { |
| $code .= _ghash_step_4x $i, @_; |
| } |
| return $code; |
| } |
| |
| # void gcm_gmult_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16]); |
| $code .= _begin_func "gcm_gmult_vpclmulqdq_avx2", 1; |
| { |
| my ( $GHASH_ACC_PTR, $HTABLE ) = @argregs[ 0 .. 1 ]; |
| my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) = |
| map( "%xmm$_", ( 0 .. 6 ) ); |
| |
| $code .= <<___; |
| @{[ _save_xmmregs (6) ]} |
| .seh_endprologue |
| |
| vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC |
| vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK |
| vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1 |
| vmovdqu .Lgfpoly(%rip), $GFPOLY |
| vpshufb $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC |
| |
| @{[ _ghash_mul $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $T0, $T1, $T2 ]} |
| |
| vpshufb $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC |
| vmovdqu $GHASH_ACC, ($GHASH_ACC_PTR) |
| ___ |
| } |
| $code .= _end_func; |
| |
| # void gcm_ghash_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16], |
| # const uint8_t *in, size_t len); |
| # |
| # Using the key |Htable|, update the GHASH accumulator |Xi| with the data given |
| # by |in| and |len|. |len| must be a multiple of 16. |
| # |
| # This function handles large amounts of AAD efficiently, while also keeping the |
# overhead low for small amounts of AAD, which is the common case. TLS uses less
| # than one block of AAD, but (uncommonly) other use cases may use much more. |
| $code .= _begin_func "gcm_ghash_vpclmulqdq_avx2", 1; |
| { |
| # Function arguments |
| my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ]; |
| |
| # Additional local variables |
| my ( $TMP0, $TMP0_XMM ) = ( "%ymm0", "%xmm0" ); |
| my ( $TMP1, $TMP1_XMM ) = ( "%ymm1", "%xmm1" ); |
| my ( $TMP2, $TMP2_XMM ) = ( "%ymm2", "%xmm2" ); |
| my ( $LO, $LO_XMM ) = ( "%ymm3", "%xmm3" ); |
| my ( $MI, $MI_XMM ) = ( "%ymm4", "%xmm4" ); |
| my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%ymm5", "%xmm5" ); |
| my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%ymm6", "%xmm6" ); |
| my ( $GFPOLY, $GFPOLY_XMM ) = ( "%ymm7", "%xmm7" ); |
| my $H_POW2_XORED = "%ymm8"; |
| my $H_POW1_XORED = "%ymm9"; |
| |
| $code .= <<___; |
| @{[ _save_xmmregs (6 .. 9) ]} |
| .seh_endprologue |
| |
| vbroadcasti128 .Lbswap_mask(%rip), $BSWAP_MASK |
| vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM |
| vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM |
| vbroadcasti128 .Lgfpoly(%rip), $GFPOLY |
| |
# Optimize for the common case of AADLEN < 32 by checking that threshold
# before the 128-byte one.
| cmp \$32, $AADLEN |
| jb .Lghash_lastblock |
| |
| cmp \$127, $AADLEN |
| jbe .Lghash_loop_1x |
| |
| # Update GHASH with 128 bytes of AAD at a time. |
| vmovdqu $OFFSETOF_H_POWERS_XORED($HTABLE), $H_POW2_XORED |
| vmovdqu $OFFSETOF_H_POWERS_XORED+32($HTABLE), $H_POW1_XORED |
| .Lghash_loop_4x: |
| @{[ _ghash_4x $AAD, $HTABLE, $BSWAP_MASK, $H_POW2_XORED, $H_POW1_XORED, |
| $TMP0, $TMP0_XMM, $TMP1, $TMP2, $LO, $MI, $GHASH_ACC, |
| $GHASH_ACC_XMM ]} |
sub \$-128, $AAD # -128 fits in a 1-byte immediate; +128 would need 4 bytes
| add \$-128, $AADLEN |
| cmp \$127, $AADLEN |
| ja .Lghash_loop_4x |
| |
| # Update GHASH with 32 bytes of AAD at a time. |
| cmp \$32, $AADLEN |
| jb .Lghash_loop_1x_done |
| .Lghash_loop_1x: |
| vmovdqu ($AAD), $TMP0 |
| vpshufb $BSWAP_MASK, $TMP0, $TMP0 |
| vpxor $TMP0, $GHASH_ACC, $GHASH_ACC |
| vmovdqu $OFFSETOFEND_H_POWERS-32($HTABLE), $TMP0 |
| @{[ _ghash_mul $TMP0, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $TMP1, $TMP2, $LO ]} |
| vextracti128 \$1, $GHASH_ACC, $TMP0_XMM |
| vpxor $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM |
| add \$32, $AAD |
| sub \$32, $AADLEN |
| cmp \$32, $AADLEN |
| jae .Lghash_loop_1x |
| .Lghash_loop_1x_done: |
| # Issue the vzeroupper that is needed after using ymm registers. Do it here |
| # instead of at the end, to minimize overhead for small AADLEN. |
| vzeroupper |
| |
| # Update GHASH with the remaining 16-byte block if any. |
| .Lghash_lastblock: |
| test $AADLEN, $AADLEN |
| jz .Lghash_done |
| vmovdqu ($AAD), $TMP0_XMM |
| vpshufb $BSWAP_MASK_XMM, $TMP0_XMM, $TMP0_XMM |
| vpxor $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM |
| vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $TMP0_XMM |
| @{[ _ghash_mul $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM, |
| $TMP1_XMM, $TMP2_XMM, $LO_XMM ]} |
| |
| .Lghash_done: |
| # Store the updated GHASH accumulator back to memory. |
| vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM |
| vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR) |
| ___ |
| } |
| $code .= _end_func; |
| |
| sub _vaesenc_4x { |
| my ( $round_key, $aesdata0, $aesdata1, $aesdata2, $aesdata3 ) = @_; |
| return <<___; |
| vaesenc $round_key, $aesdata0, $aesdata0 |
| vaesenc $round_key, $aesdata1, $aesdata1 |
| vaesenc $round_key, $aesdata2, $aesdata2 |
| vaesenc $round_key, $aesdata3, $aesdata3 |
| ___ |
| } |
| |
| sub _ctr_begin_4x { |
| my ( |
| $le_ctr, $bswap_mask, $rndkey0, $aesdata0, |
| $aesdata1, $aesdata2, $aesdata3, $tmp |
| ) = @_; |
| return <<___; |
| # Increment le_ctr four times to generate four vectors of little-endian |
| # counter blocks, swap each to big-endian, and store them in aesdata[0-3]. |
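# Note that only the low 32-bit word of each counter block is incremented, as
# the GCM standard requires; .Linc_2blocks adds 2 to that word only.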
| vmovdqu .Linc_2blocks(%rip), $tmp |
| vpshufb $bswap_mask, $le_ctr, $aesdata0 |
| vpaddd $tmp, $le_ctr, $le_ctr |
| vpshufb $bswap_mask, $le_ctr, $aesdata1 |
| vpaddd $tmp, $le_ctr, $le_ctr |
| vpshufb $bswap_mask, $le_ctr, $aesdata2 |
| vpaddd $tmp, $le_ctr, $le_ctr |
| vpshufb $bswap_mask, $le_ctr, $aesdata3 |
| vpaddd $tmp, $le_ctr, $le_ctr |
| |
| # AES "round zero": XOR in the zero-th round key. |
| vpxor $rndkey0, $aesdata0, $aesdata0 |
| vpxor $rndkey0, $aesdata1, $aesdata1 |
| vpxor $rndkey0, $aesdata2, $aesdata2 |
| vpxor $rndkey0, $aesdata3, $aesdata3 |
| ___ |
| } |
| |
| # Do the last AES round for four vectors of counter blocks, XOR four vectors of |
| # source data with the resulting keystream blocks, and write the result to the |
# destination buffer. The implementation differs slightly from that
# description: it takes advantage of the property
# vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a) to reduce latency, but
# the effect is the same.
| sub _aesenclast_and_xor_4x { |
| my ( |
| $src, $dst, $rndkeylast, $aesdata0, |
| $aesdata1, $aesdata2, $aesdata3, $t0, |
| $t1, $t2, $t3 |
| ) = @_; |
| return <<___; |
| vpxor 0*32($src), $rndkeylast, $t0 |
| vpxor 1*32($src), $rndkeylast, $t1 |
| vpxor 2*32($src), $rndkeylast, $t2 |
| vpxor 3*32($src), $rndkeylast, $t3 |
| vaesenclast $t0, $aesdata0, $aesdata0 |
| vaesenclast $t1, $aesdata1, $aesdata1 |
| vaesenclast $t2, $aesdata2, $aesdata2 |
| vaesenclast $t3, $aesdata3, $aesdata3 |
| vmovdqu $aesdata0, 0*32($dst) |
| vmovdqu $aesdata1, 1*32($dst) |
| vmovdqu $aesdata2, 2*32($dst) |
| vmovdqu $aesdata3, 3*32($dst) |
| ___ |
| } |
| |
| my $g_update_macro_expansion_count = 0; |
| |
| # void aes_gcm_{enc,dec}_update_vaes_avx2(const uint8_t *in, uint8_t *out, |
| # size_t len, const AES_KEY *key, |
| # const uint8_t ivec[16], |
| # const u128 Htable[16], |
| # uint8_t Xi[16]); |
| # |
| # This macro generates a GCM encryption or decryption update function with the |
| # above prototype (with \enc selecting which one). The function computes the |
| # next portion of the CTR keystream, XOR's it with |len| bytes from |in|, and |
| # writes the resulting encrypted or decrypted data to |out|. It also updates |
| # the GHASH accumulator |Xi| using the next |len| ciphertext bytes. |
| # |
| # |len| must be a multiple of 16. The caller must do any buffering needed to |
| # ensure this. Both in-place and out-of-place en/decryption are supported. |
| # |
| # |ivec| must give the current counter in big-endian format. This function |
| # loads the counter from |ivec| and increments the loaded counter as needed, but |
| # it does *not* store the updated counter back to |ivec|. The caller must |
| # update |ivec| if any more data segments follow. Internally, only the low |
| # 32-bit word of the counter is incremented, following the GCM standard. |
| sub _aes_gcm_update { |
| my $local_label_suffix = "__func" . ++$g_update_macro_expansion_count; |
| my ($enc) = @_; |
| my $code = ""; |
| |
| # Function arguments |
| my ( $SRC, $DST, $DATALEN, $AESKEY, $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR ) |
| = $win64 |
| ? ( @argregs[ 0 .. 3 ], "%rsi", "%rdi", "%r12" ) |
| : ( @argregs[ 0 .. 5 ], "%r12" ); |
| |
| # Additional local variables. |
| # %rax is used as a temporary register. BE_CTR_PTR is also available as a |
| # temporary register after the counter is loaded. |
| |
| # AES key length in bytes |
| my ( $AESKEYLEN, $AESKEYLEN64 ) = ( "%r10d", "%r10" ); |
| |
| # Pointer to the last AES round key for the chosen AES variant |
| my $RNDKEYLAST_PTR = "%r11"; |
| |
| # BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values |
| # using vpshufb, copied to all 128-bit lanes. |
| my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%ymm0", "%xmm0" ); |
| |
| # GHASH_ACC is the accumulator variable for GHASH. When fully reduced, |
| # only the lowest 128-bit lane can be nonzero. When not fully reduced, |
| # more than one lane may be used, and they need to be XOR'd together. |
| my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%ymm1", "%xmm1" ); |
| |
| # TMP[0-2] are temporary registers. |
| my ( $TMP0, $TMP0_XMM ) = ( "%ymm2", "%xmm2" ); |
| my ( $TMP1, $TMP1_XMM ) = ( "%ymm3", "%xmm3" ); |
| my ( $TMP2, $TMP2_XMM ) = ( "%ymm4", "%xmm4" ); |
| |
| # LO and MI are used to accumulate unreduced GHASH products. |
| my ( $LO, $LO_XMM ) = ( "%ymm5", "%xmm5" ); |
| my ( $MI, $MI_XMM ) = ( "%ymm6", "%xmm6" ); |
| |
| # Cached key powers from Htable |
| my ( $H_POW2_XORED, $H_POW2_XORED_XMM ) = ( "%ymm7", "%xmm7" ); |
| my ( $H_POW1_XORED, $H_POW1_XORED_XMM ) = ( "%ymm8", "%xmm8" ); |
| |
| # RNDKEY0 caches the zero-th round key, and RNDKEYLAST the last one. |
| my $RNDKEY0 = "%ymm9"; |
| my $RNDKEYLAST = "%ymm10"; |
| |
| # LE_CTR contains the next set of little-endian counter blocks. |
| my $LE_CTR = "%ymm11"; |
| |
| # AESDATA[0-3] hold the counter blocks that are being encrypted by AES. |
| my ( $AESDATA0, $AESDATA0_XMM ) = ( "%ymm12", "%xmm12" ); |
| my ( $AESDATA1, $AESDATA1_XMM ) = ( "%ymm13", "%xmm13" ); |
| my ( $AESDATA2, $AESDATA2_XMM ) = ( "%ymm14", "%xmm14" ); |
| my ( $AESDATA3, $AESDATA3_XMM ) = ( "%ymm15", "%xmm15" ); |
| my @AESDATA = ( $AESDATA0, $AESDATA1, $AESDATA2, $AESDATA3 ); |
| |
| my @ghash_4x_args = ( |
| $enc ? $DST : $SRC, $HTABLE, $BSWAP_MASK, $H_POW2_XORED, |
| $H_POW1_XORED, $TMP0, $TMP0_XMM, $TMP1, |
| $TMP2, $LO, $MI, $GHASH_ACC, |
| $GHASH_ACC_XMM |
| ); |
| |
| if ($win64) { |
| $code .= <<___; |
| @{[ _save_gpregs $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR ]} |
| mov 64(%rsp), $BE_CTR_PTR # arg5 |
| mov 72(%rsp), $HTABLE # arg6 |
| mov 80(%rsp), $GHASH_ACC_PTR # arg7 |
| @{[ _save_xmmregs (6 .. 15) ]} |
| .seh_endprologue |
| ___ |
| } |
| else { |
| $code .= <<___; |
| @{[ _save_gpregs $GHASH_ACC_PTR ]} |
| mov 16(%rsp), $GHASH_ACC_PTR # arg7 |
| ___ |
| } |
| |
| if ($enc) { |
| $code .= <<___; |
| #ifdef BORINGSSL_DISPATCH_TEST |
| .extern BORINGSSL_function_hit |
| movb \$1,BORINGSSL_function_hit+8(%rip) |
| #endif |
| ___ |
| } |
| $code .= <<___; |
| vbroadcasti128 .Lbswap_mask(%rip), $BSWAP_MASK |
| |
| # Load the GHASH accumulator and the starting counter. |
| # BoringSSL passes these values in big endian format. |
| vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM |
| vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM |
| vbroadcasti128 ($BE_CTR_PTR), $LE_CTR |
| vpshufb $BSWAP_MASK, $LE_CTR, $LE_CTR |
| |
| # Load the AES key length in bytes. BoringSSL stores number of rounds |
| # minus 1, so convert using: AESKEYLEN = 4 * aeskey->rounds - 20. |
| movl $OFFSETOF_AES_ROUNDS($AESKEY), $AESKEYLEN |
| lea -20(,$AESKEYLEN,4), $AESKEYLEN |
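# e.g. for AES-128 the stored value is 9 (10 rounds minus 1), giving
# AESKEYLEN = 4*9 - 20 = 16.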
| |
| # Make RNDKEYLAST_PTR point to the last AES round key. This is the |
| # round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256 |
| # respectively. Then load the zero-th and last round keys. |
| lea 6*16($AESKEY,$AESKEYLEN64,4), $RNDKEYLAST_PTR |
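# e.g. for AES-128, AESKEYLEN is 16, so RNDKEYLAST_PTR = AESKEY + 6*16 + 4*16
# = AESKEY + 160, which is the round key with index 10.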
| vbroadcasti128 ($AESKEY), $RNDKEY0 |
| vbroadcasti128 ($RNDKEYLAST_PTR), $RNDKEYLAST |
| |
| # Finish initializing LE_CTR by adding 1 to the second block. |
| vpaddd .Lctr_pattern(%rip), $LE_CTR, $LE_CTR |
| |
| # If there are at least 128 bytes of data, then continue into the loop that |
| # processes 128 bytes of data at a time. Otherwise skip it. |
| cmp \$127, $DATALEN |
| jbe .Lcrypt_loop_4x_done$local_label_suffix |
| |
| vmovdqu $OFFSETOF_H_POWERS_XORED($HTABLE), $H_POW2_XORED |
| vmovdqu $OFFSETOF_H_POWERS_XORED+32($HTABLE), $H_POW1_XORED |
| ___ |
| |
| # Main loop: en/decrypt and hash 4 vectors (128 bytes) at a time. |
| |
| if ($enc) { |
| $code .= <<___; |
| # Encrypt the first 4 vectors of plaintext blocks. |
| @{[ _ctr_begin_4x $LE_CTR, $BSWAP_MASK, $RNDKEY0, @AESDATA, $TMP0 ]} |
| lea 16($AESKEY), %rax |
| .Lvaesenc_loop_first_4_vecs$local_label_suffix: |
| vbroadcasti128 (%rax), $TMP0 |
| @{[ _vaesenc_4x $TMP0, @AESDATA ]} |
| add \$16, %rax |
| cmp %rax, $RNDKEYLAST_PTR |
| jne .Lvaesenc_loop_first_4_vecs$local_label_suffix |
| @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA, |
| $TMP0, $TMP1, $LO, $MI ]} |
sub \$-128, $SRC # -128 fits in a 1-byte immediate; +128 would need 4 bytes
| add \$-128, $DATALEN |
| cmp \$127, $DATALEN |
| jbe .Lghash_last_ciphertext_4x$local_label_suffix |
| ___ |
| } |
| |
| $code .= <<___; |
| .align 16 |
| .Lcrypt_loop_4x$local_label_suffix: |
| |
| # Start the AES encryption of the counter blocks. |
| @{[ _ctr_begin_4x $LE_CTR, $BSWAP_MASK, $RNDKEY0, @AESDATA, $TMP0 ]} |
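# Dispatch on the AES key length: AES-256 runs all 13 middle rounds, AES-192
# skips the first two of them, and AES-128 the first four. The final 9 middle
# rounds are shared and are interleaved with the GHASH steps below.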
| cmp \$24, $AESKEYLEN |
| jl .Laes128$local_label_suffix |
| je .Laes192$local_label_suffix |
| # AES-256 |
| vbroadcasti128 -13*16($RNDKEYLAST_PTR), $TMP0 |
| @{[ _vaesenc_4x $TMP0, @AESDATA ]} |
| vbroadcasti128 -12*16($RNDKEYLAST_PTR), $TMP0 |
| @{[ _vaesenc_4x $TMP0, @AESDATA ]} |
| .Laes192$local_label_suffix: |
| vbroadcasti128 -11*16($RNDKEYLAST_PTR), $TMP0 |
| @{[ _vaesenc_4x $TMP0, @AESDATA ]} |
| vbroadcasti128 -10*16($RNDKEYLAST_PTR), $TMP0 |
| @{[ _vaesenc_4x $TMP0, @AESDATA ]} |
| .Laes128$local_label_suffix: |
| ___ |
| |
| # Prefetch the source data 512 bytes ahead into the L1 data cache, to |
| # improve performance when the hardware prefetcher is disabled. Assumes the |
| # L1 data cache line size is 64 bytes (de facto standard on x86_64). |
| $code .= "prefetcht0 512($SRC)\n"; |
| $code .= "prefetcht0 512+64($SRC)\n"; |
| |
| # Finish the AES encryption of the counter blocks in AESDATA[0-3], |
| # interleaved with the GHASH update of the ciphertext blocks. |
| for my $i ( reverse 1 .. 9 ) { |
| $code .= <<___; |
| @{[ _ghash_step_4x 9-$i, @ghash_4x_args ]} |
| vbroadcasti128 -$i*16($RNDKEYLAST_PTR), $TMP0 |
| @{[ _vaesenc_4x $TMP0, @AESDATA ]} |
| ___ |
| } |
| $code .= <<___; |
| @{[ _ghash_step_4x 9, @ghash_4x_args ]} |
| |
| @{[ $enc ? "sub \$-128, $DST" : "" ]} # 128 is 4 bytes, -128 is 1 byte |
| @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA, |
| $TMP0, $TMP1, $LO, $MI ]} |
| sub \$-128, $SRC |
| @{[ !$enc ? "sub \$-128, $DST" : "" ]} |
| add \$-128, $DATALEN |
| cmp \$127, $DATALEN |
| ja .Lcrypt_loop_4x$local_label_suffix |
| ___ |
| |
| if ($enc) { |
| |
| # Update GHASH with the last set of ciphertext blocks. |
| $code .= <<___; |
| .Lghash_last_ciphertext_4x$local_label_suffix: |
| @{[ _ghash_4x @ghash_4x_args ]} |
| sub \$-128, $DST |
| ___ |
| } |
| |
| my $POWERS_PTR = $BE_CTR_PTR; # BE_CTR_PTR is free to be reused. |
| my ( $HI, $HI_XMM ) = ( $H_POW2_XORED, $H_POW2_XORED_XMM ); # reuse |
| |
| $code .= <<___; |
| .Lcrypt_loop_4x_done$local_label_suffix: |
| # Check whether any data remains. |
| test $DATALEN, $DATALEN |
| jz .Ldone$local_label_suffix |
| |
| # DATALEN is in [16, 32, 48, 64, 80, 96, 112]. |
| |
| # Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N |
| # is the number of blocks that remain. |
| lea $OFFSETOFEND_H_POWERS($HTABLE), $POWERS_PTR |
| sub $DATALEN, $POWERS_PTR |
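# e.g. DATALEN = 48 (3 remaining blocks) gives POWERS_PTR = HTABLE + 128 - 48,
# which points at [H^3, H^2, H^1].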
| |
| # Start collecting the unreduced GHASH intermediate value LO, MI, HI. |
| vpxor $LO_XMM, $LO_XMM, $LO_XMM |
| vpxor $MI_XMM, $MI_XMM, $MI_XMM |
| vpxor $HI_XMM, $HI_XMM, $HI_XMM |
| |
| cmp \$64, $DATALEN |
| jb .Llessthan64bytes$local_label_suffix |
| |
| # DATALEN is in [64, 80, 96, 112]. Encrypt two vectors of counter blocks. |
| vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA0 |
| vpaddd .Linc_2blocks(%rip), $LE_CTR, $LE_CTR |
| vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA1 |
| vpaddd .Linc_2blocks(%rip), $LE_CTR, $LE_CTR |
| vpxor $RNDKEY0, $AESDATA0, $AESDATA0 |
| vpxor $RNDKEY0, $AESDATA1, $AESDATA1 |
| lea 16($AESKEY), %rax |
| .Lvaesenc_loop_tail_1$local_label_suffix: |
| vbroadcasti128 (%rax), $TMP0 |
| vaesenc $TMP0, $AESDATA0, $AESDATA0 |
| vaesenc $TMP0, $AESDATA1, $AESDATA1 |
| add \$16, %rax |
| cmp %rax, $RNDKEYLAST_PTR |
| jne .Lvaesenc_loop_tail_1$local_label_suffix |
| vaesenclast $RNDKEYLAST, $AESDATA0, $AESDATA0 |
| vaesenclast $RNDKEYLAST, $AESDATA1, $AESDATA1 |
| |
| # XOR the data with the two vectors of keystream blocks. |
| vmovdqu 0($SRC), $TMP0 |
| vmovdqu 32($SRC), $TMP1 |
| vpxor $TMP0, $AESDATA0, $AESDATA0 |
| vpxor $TMP1, $AESDATA1, $AESDATA1 |
| vmovdqu $AESDATA0, 0($DST) |
| vmovdqu $AESDATA1, 32($DST) |
| |
| # Update GHASH with two vectors of ciphertext blocks, without reducing. |
| vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0 |
| vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA1 : $TMP1 ]}, $AESDATA1 |
| vpxor $GHASH_ACC, $AESDATA0, $AESDATA0 |
| vmovdqu ($POWERS_PTR), $TMP0 |
| vmovdqu 32($POWERS_PTR), $TMP1 |
| vpclmulqdq \$0x00, $TMP0, $AESDATA0, $LO |
| vpclmulqdq \$0x01, $TMP0, $AESDATA0, $MI |
| vpclmulqdq \$0x10, $TMP0, $AESDATA0, $TMP2 |
| vpxor $TMP2, $MI, $MI |
| vpclmulqdq \$0x11, $TMP0, $AESDATA0, $HI |
| vpclmulqdq \$0x00, $TMP1, $AESDATA1, $TMP2 |
| vpxor $TMP2, $LO, $LO |
| vpclmulqdq \$0x01, $TMP1, $AESDATA1, $TMP2 |
| vpxor $TMP2, $MI, $MI |
| vpclmulqdq \$0x10, $TMP1, $AESDATA1, $TMP2 |
| vpxor $TMP2, $MI, $MI |
| vpclmulqdq \$0x11, $TMP1, $AESDATA1, $TMP2 |
| vpxor $TMP2, $HI, $HI |
| |
| add \$64, $POWERS_PTR |
| add \$64, $SRC |
| add \$64, $DST |
| sub \$64, $DATALEN |
| jz .Lreduce$local_label_suffix |
| |
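# The GHASH accumulator was already folded into the first data vector of the
# 64-byte chunk above, so zeroize it; the tail code below XORs GHASH_ACC into
# its first data vector unconditionally.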
| vpxor $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM |
| |
# DATALEN is in [16, 32, 48]. Encrypt the last two vectors of counter blocks.
| .Llessthan64bytes$local_label_suffix: |
| vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA0 |
| vpaddd .Linc_2blocks(%rip), $LE_CTR, $LE_CTR |
| vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA1 |
| vpxor $RNDKEY0, $AESDATA0, $AESDATA0 |
| vpxor $RNDKEY0, $AESDATA1, $AESDATA1 |
| lea 16($AESKEY), %rax |
| .Lvaesenc_loop_tail_2$local_label_suffix: |
| vbroadcasti128 (%rax), $TMP0 |
| vaesenc $TMP0, $AESDATA0, $AESDATA0 |
| vaesenc $TMP0, $AESDATA1, $AESDATA1 |
| add \$16, %rax |
| cmp %rax, $RNDKEYLAST_PTR |
| jne .Lvaesenc_loop_tail_2$local_label_suffix |
| vaesenclast $RNDKEYLAST, $AESDATA0, $AESDATA0 |
| vaesenclast $RNDKEYLAST, $AESDATA1, $AESDATA1 |
| |
| # XOR the remaining data with the keystream blocks, and update GHASH with |
| # the remaining ciphertext blocks without reducing. |
| |
| cmp \$32, $DATALEN |
| jb .Lxor_one_block$local_label_suffix |
| je .Lxor_two_blocks$local_label_suffix |
| |
| .Lxor_three_blocks$local_label_suffix: |
| vmovdqu 0($SRC), $TMP0 |
| vmovdqu 32($SRC), $TMP1_XMM |
| vpxor $TMP0, $AESDATA0, $AESDATA0 |
| vpxor $TMP1_XMM, $AESDATA1_XMM, $AESDATA1_XMM |
| vmovdqu $AESDATA0, 0($DST) |
| vmovdqu $AESDATA1_XMM, 32($DST) |
| |
| vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0 |
| vpshufb $BSWAP_MASK_XMM, @{[ $enc ? $AESDATA1_XMM : $TMP1_XMM ]}, $AESDATA1_XMM |
| vpxor $GHASH_ACC, $AESDATA0, $AESDATA0 |
| vmovdqu ($POWERS_PTR), $TMP0 |
| vmovdqu 32($POWERS_PTR), $TMP1_XMM |
| vpclmulqdq \$0x00, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM |
| vpxor $TMP2, $LO, $LO |
| vpclmulqdq \$0x01, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM |
| vpxor $TMP2, $MI, $MI |
| vpclmulqdq \$0x10, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM |
| vpxor $TMP2, $MI, $MI |
| vpclmulqdq \$0x11, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM |
| vpxor $TMP2, $HI, $HI |
| jmp .Lghash_mul_one_vec_unreduced$local_label_suffix |
| |
| .Lxor_two_blocks$local_label_suffix: |
| vmovdqu ($SRC), $TMP0 |
| vpxor $TMP0, $AESDATA0, $AESDATA0 |
| vmovdqu $AESDATA0, ($DST) |
| vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0 |
| vpxor $GHASH_ACC, $AESDATA0, $AESDATA0 |
| vmovdqu ($POWERS_PTR), $TMP0 |
| jmp .Lghash_mul_one_vec_unreduced$local_label_suffix |
| |
| .Lxor_one_block$local_label_suffix: |
| vmovdqu ($SRC), $TMP0_XMM |
| vpxor $TMP0_XMM, $AESDATA0_XMM, $AESDATA0_XMM |
| vmovdqu $AESDATA0_XMM, ($DST) |
| vpshufb $BSWAP_MASK_XMM, @{[ $enc ? $AESDATA0_XMM : $TMP0_XMM ]}, $AESDATA0_XMM |
| vpxor $GHASH_ACC_XMM, $AESDATA0_XMM, $AESDATA0_XMM |
| vmovdqu ($POWERS_PTR), $TMP0_XMM |
| |
| .Lghash_mul_one_vec_unreduced$local_label_suffix: |
| vpclmulqdq \$0x00, $TMP0, $AESDATA0, $TMP2 |
| vpxor $TMP2, $LO, $LO |
| vpclmulqdq \$0x01, $TMP0, $AESDATA0, $TMP2 |
| vpxor $TMP2, $MI, $MI |
| vpclmulqdq \$0x10, $TMP0, $AESDATA0, $TMP2 |
| vpxor $TMP2, $MI, $MI |
| vpclmulqdq \$0x11, $TMP0, $AESDATA0, $TMP2 |
| vpxor $TMP2, $HI, $HI |
| |
| .Lreduce$local_label_suffix: |
| # Finally, do the GHASH reduction. |
| vbroadcasti128 .Lgfpoly(%rip), $TMP0 |
| vpclmulqdq \$0x01, $LO, $TMP0, $TMP1 |
| vpshufd \$0x4e, $LO, $LO |
| vpxor $LO, $MI, $MI |
| vpxor $TMP1, $MI, $MI |
| vpclmulqdq \$0x01, $MI, $TMP0, $TMP1 |
| vpshufd \$0x4e, $MI, $MI |
| vpxor $MI, $HI, $HI |
| vpxor $TMP1, $HI, $HI |
| vextracti128 \$1, $HI, $GHASH_ACC_XMM |
| vpxor $HI_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM |
| |
| .Ldone$local_label_suffix: |
| # Store the updated GHASH accumulator back to memory. |
| vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM |
| vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR) |
| |
| vzeroupper |
| ___ |
| return $code; |
| } |
| |
| $code .= _begin_func "aes_gcm_enc_update_vaes_avx2", 1; |
| $code .= _aes_gcm_update 1; |
| $code .= _end_func; |
| |
| $code .= _begin_func "aes_gcm_dec_update_vaes_avx2", 1; |
| $code .= _aes_gcm_update 0; |
| $code .= _end_func; |
| |
| print $code; |
| close STDOUT or die "error closing STDOUT: $!"; |
| exit 0; |