| #! /usr/bin/env perl |
| |
| # Copyright (c) 2022, ARM Inc. |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # https://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| #======================================================================== |
| # Written by Fangming Fang <fangming.fang@arm.com> for the OpenSSL project, |
| # derived from https://github.com/ARM-software/AArch64cryptolib, original |
| # author Samuel Lee <Samuel.Lee@arm.com>. |
| #======================================================================== |
| # |
| # Approach - assume we don't want to reload constants, so reserve ~half of |
| # vector register file for constants |
| # |
| # main loop to act on 4 16B blocks per iteration, and then do modulo of the |
| # accumulated intermediate hashes from the 4 blocks |
| # |
| # ____________________________________________________ |
| # | | |
| # | PRE | |
| # |____________________________________________________| |
| # | | | | |
| # | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 | |
| # |________________|________________|__________________| |
| # | | | | |
| # | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 | |
| # |________________|________________|__________________| |
| # | | | | |
| # | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 | |
| # |________________|________________|__________________| |
| # | | | | |
| # | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 | |
| # |________________|____(mostly)____|__________________| |
| # | | |
| # | MODULO | |
| # |____________________________________________________| |
| # |
| # PRE: Ensure previous generated intermediate hash is aligned and merged with |
| # result for GHASH 4k+0 |
| # |
| # EXT low_acc, low_acc, low_acc, #8 |
| # EOR res_curr (4k+0), res_curr (4k+0), low_acc |
| # |
| # CTR block: Increment and byte reverse counter in scalar registers and transfer |
| # to SIMD registers |
| # |
| # REV ctr32, rev_ctr32 |
| # ORR ctr64, constctr96_top32, ctr32, LSL #32 |
| # // Keeping this in scalar registers to free up space in SIMD RF |
| # INS ctr_next.d[0], constctr96_bottom64 |
| # INS ctr_next.d[1], ctr64X |
| # ADD rev_ctr32, #1 |
| # |
| # AES block: |
| # |
| # Do AES encryption/decryption on CTR block X and EOR it with input block X. |
| # Take 256 bytes key below for example. Doing small trick here of loading input |
| # in scalar registers, EORing with last key and then transferring Given we are |
| # very constrained in our ASIMD registers this is quite important |
| # |
| # Encrypt: |
| # LDR input_low, [ input_ptr ], #8 |
| # LDR input_high, [ input_ptr ], #8 |
| # EOR input_low, k14_low |
| # EOR input_high, k14_high |
| # INS res_curr.d[0], input_low |
| # INS res_curr.d[1], input_high |
| # AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k13 |
| # EOR res_curr, res_curr, ctr_curr |
| # ST1 { res_curr.16b }, [ output_ptr ], #16 |
| # |
| # Decrypt: |
| # AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr |
| # AESE ctr_curr, k13 |
| # LDR res_curr, [ input_ptr ], #16 |
| # EOR res_curr, res_curr, ctr_curr |
| # MOV output_low, res_curr.d[0] |
| # MOV output_high, res_curr.d[1] |
| # EOR output_low, k14_low |
| # EOR output_high, k14_high |
| # STP output_low, output_high, [ output_ptr ], #16 |
| # |
| # GHASH block X: |
| # Do 128b karatsuba polynomial multiplication on block. We only have |
| # 64b->128b polynomial multipliers, naively that means we need to do 4 64b |
| # multiplies to generate a 128b. |
| # |
| # multiplication: |
| # Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ |
| # (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64 |
| # |
| # The idea behind Karatsuba multiplication is that we can do just 3 64b |
| # multiplies: |
| # Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ |
| # (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ |
| # Pmull(Al,Bl))<<64 |
| # |
| # There is some complication here because the bit order of GHASH's PMULL is |
| # reversed compared to elsewhere, so we are multiplying with "twisted" |
| # powers of H |
| # |
| # Note: We can PMULL directly into the acc_x in first GHASH of the loop |
| # |
| # Note: For scheduling big cores we want to split the processing to happen over |
| # two loop iterations - otherwise the critical path latency dominates the |
| # performance. |
| # |
| # This has a knock on effect on register pressure, so we have to be a bit |
| # more clever with our temporary registers than indicated here |
| # |
| # REV64 res_curr, res_curr |
| # INS t_m.d[0], res_curr.d[1] |
| # EOR t_m.8B, t_m.8B, res_curr.8B |
| # PMULL2 t_h, res_curr, HX |
| # PMULL t_l, res_curr, HX |
| # PMULL t_m, t_m, HX_k |
| # EOR acc_h, acc_h, t_h |
| # EOR acc_l, acc_l, t_l |
| # EOR acc_m, acc_m, t_m |
| # |
| # MODULO: take the partial accumulators (~representing sum of 256b |
| # multiplication results), from GHASH and do modulo reduction on them |
| # There is some complication here because the bit order of GHASH's |
| # PMULL is reversed compared to elsewhere, so we are doing modulo with |
| # a reversed constant |
| # |
| # EOR acc_m, acc_m, acc_h |
| # EOR acc_m, acc_m, acc_l // Finish off karatsuba processing |
| # PMULL t_mod, acc_h, mod_constant |
| # EXT acc_h, acc_h, acc_h, #8 |
| # EOR acc_m, acc_m, acc_h |
| # EOR acc_m, acc_m, t_mod |
| # PMULL acc_h, acc_m, mod_constant |
| # EXT acc_m, acc_m, acc_m, #8 |
| # EOR acc_l, acc_l, acc_h |
| # EOR acc_l, acc_l, acc_m |
| # |
| # Key Optimizations: |
| # |
| # NOTE: This implementation is heavily NEON-bound due to the large amount of |
| # vector computation. The original code from which this was derived often |
| # avoided unnecessary Load/Store operations, but at the expense of |
| # increased NEON pipeline pressure and more GPR <-> NEON data movements. |
| # The primary goal of the optimizations applied here is to reduce NEON |
| # pipeline pressure. This includes minimizing NEON register-to-register moves, |
| # GPR to NEON transfers (and vice versa), and other operations that consume |
| # execution slots on the NEON pipes. |
| # |
| # 1. Merged Kernels: AES-128, AES-192, and AES-256 GCM encryption and |
| # decryption are handled by single kernels each, reducing code size. |
| # Conditional branching is used for the key-size specific rounds. |
| # |
| # 2. Aggressive Pipelining: GHASH and AES operations for different blocks |
| # (4k to 4k+3) are heavily interleaved within the main loop. GHASH for |
| # block `n` starts while AES for block `n` is still in progress, and |
| # AES for block `n+4` begins. This hides instruction latencies. |
| # |
| # 3. Optimized Counter Handling: Instead of incrementing, reversing, and |
| # moving the counter to NEON registers for each of the four blocks in |
| # every loop iteration, we precompute and cache counter values on the |
| # stack. |
| # - The lower 96 bits of the counter are constant across the four blocks |
| # within an iteration. |
| # - The upper 32 bits (the byte-swapped incrementing part) are calculated |
| # for all four blocks (N, N+1, N+2, N+3) and stored on the stack at |
| # the start of the loop. |
| # - Values are calculated one loop iteration ahead of time. That is, |
| # immediately after loading values N-N+3 we calculate and store values |
| # N+4-N+7. This ensures that the data is available when loaded without |
| # store-load forwarding. |
| # - This strategy eliminates repeated scalar-to-vector transfers (FMOV) |
| # and scalar increment/reverse operations inside the tight loop. |
| # |
| # 4. Reduced Scalar-NEON Transfers: |
| # - For encryption, the final XOR with the last round key (rkN) and the |
| # AES-encrypted counter is done entirely in NEON registers using |
| # EOR or PLATFORM_EOR3, avoiding costly FMOV instructions to move |
| # plaintext data from GPRs to NEON after partial operations. A similar |
| # optimization is used for decryption. |
| # |
| # 5. EOR3 Optimization: The PLATFORM_EOR3 macro allows using the single |
| # EOR3 instruction (if available) to replace two EOR instructions, |
| # i.e., D = A ^ B ^ C. This is used in both GHASH accumulator updates |
| # and the final AES output XOR. Both versions of encrypt and decrypt are |
| # stored in the same assembly file. |
| # |
| # 6. Instruction scheduling was significantly modified both to directly improve |
| # performance as well as to remove false dependencies when that allowed |
| # additional improvements. There is still a bit of additional speedup |
| # possible but would be easiest to extract with platform-specific |
| # implementations. |
| # |
| use strict; |
| use warnings; |
| |
| my $flavour = shift; |
| my $output = shift; |
| |
| if (!defined $flavour) { |
| die "Usage: $0 <flavour> [output_filename]\n"; |
| } |
| |
| $0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; |
| my $xlate; |
| ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or |
| ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or |
| die "can't locate arm-xlate.pl"; |
| |
| open OUT, "|-", $^X, $xlate, $flavour, $output; |
| *STDOUT=*OUT; |
| |
| # Converts an EOR3 pseudo-instruction into either the raw EOR3 encoding or into |
| # two EOR2 instructions. |
| sub process_eor3_match { |
| my ($vd_str, $vn_str, $vm_str, $va_str, $is_eor3) = @_; |
| |
| if ($is_eor3) { |
| # Directly emit the EOR3 mnemonic. |
| # Requires: .arch armv8.2-a+sha3 OR -march=armv8.2-a+sha3 |
| return sprintf(" eor3 %s.16b, %s.16b, %s.16b, %s.16b", |
| $vd_str, $vn_str, $vm_str, $va_str); |
| } else { |
| # Fallback: Two standard NEON EOR instructions |
| # Result = (Vn ^ Vm) ^ Va |
| # 1. Vd = Vn ^ Vm |
| # 2. Vd = Vd ^ Va |
| return sprintf(" eor %s.16b, %s.16b, %s.16b\n" . |
| " eor %s.16b, %s.16b, %s.16b", |
| $vd_str, $vn_str, $vm_str, # Step 1 |
| $vd_str, $vd_str, $va_str); # Step 2 |
| } |
| } |
| |
| # Generates the code string for a specific version (Standard or EOR3). |
| # Performs regex substitutions for EOR3 instructions and symbol renaming. |
| sub get_transformed_code { |
| my ($is_eor3, $template) = @_; |
| my $temp_code = $template; |
| |
| if ($is_eor3) { |
| # Suffix function names |
| $temp_code =~ s/\b(aes_gcm_enc_kernel)\b/${1}_eor3/g; |
| $temp_code =~ s/\b(aes_gcm_dec_kernel)\b/${1}_eor3/g; |
| # Suffix local labels |
| $temp_code =~ s/(\.L[a-zA-Z0-9_]+)/$1_eor3/g; |
| } |
| |
| $temp_code =~ s/PLATFORM_EOR3\s*\(\s*([vV0-9]+)\s*,\s*([vV0-9]+)\s*,\s*([vV0-9]+)\s*,\s*([vV0-9]+)\s*\)/ |
| process_eor3_match($1, $2, $3, $4, $is_eor3)/ge; |
| |
| return $temp_code; |
| } |
| |
| { |
| { |
| # We attempt to minimize the number of aliases for the same register but |
| # unfortunately sometimes you have to specify a Q or D register. |
| |
| ## AArch64 General-Purpose Registers (GPRs) |
| my $input_ptr = "x0"; # IN: Pointer to input plaintext |
| my $bit_length = "x1"; # IN: Length of input in bits |
| my $output_ptr = "x2"; # IN: Pointer to output ciphertext |
| my $current_tag = "x3"; # IN: Pointer to 16-byte GCM tag (T) |
| my $end_input_ptr = "x4"; # Calculated end of all input |
| my $main_end_input_ptr = "x5"; # Calculated end of the main 4-block loop |
| my $Htable = "x6"; # Pointer to GHASH key table (H) |
| my $key_ptr = "x7"; # Pointer to end of AES key schedule |
| my $cc = "x8"; # Pointer to AES key schedule (context) |
| |
| # Re-purposed registers for tail/scalar handling |
| my $input_l0 = "x6"; |
| my $input_h0 = "x7"; |
| |
| my $T32 = "x10"; # Holds scalar 64-bit counter value |
| my $W32 = "w10"; # Holds scalar 32-bit counter value |
| my $W_T32 = "w12"; # Holds reversed scalar 32-bit counter |
| my $rkN_l = "x13"; # Holds low 64 bits of final round key |
| my $rkN_h = "x14"; # Holds high 64 bits of final round key |
| my $len = "x15"; # Byte length of input |
| my $counter = "x16"; # Holds pointer to the 16-byte counter block (CTR) |
| my $rounds = "x17"; # Number of AES rounds (scalar) |
| my $rounds_w = "w17"; # Number of AES rounds (scalar) |
| my $tmp_gpr_w = "w20"; # Scratch register (word) |
| my $mod_constantx = "x21"; # Holds GHASH modulus 0xc2... (scalar) |
| |
| ## NEON/SIMD Registers (v0-v31) |
| my ($ctr0, $ctr1, $ctr2, $ctr3) = map("v$_", (0..3)); |
| my ($ctr0q, $ctr1q, $ctr2q, $ctr3q) = map("q$_", (0..3)); |
| my $ctr0d = "d0"; |
| |
| my ($res0, $res1, $res2, $res3) = map("v$_", (4..7)); |
| my ($res0q, $res1q, $res2q, $res3q) = map("q$_", (4..7)); |
| my ($res0d, $res1d, $res2d, $res3d) = map("d$_", (4..7)); |
| |
| my ($acc_h, $acc_m, $acc_l) = map("v$_", (9..11)); |
| |
| my ($h1, $h2, $h3, $h4) = map("v$_", (12..15)); |
| my ($h1q, $h2q, $h3q, $h4q) = map("q$_", (12..15)); |
| |
| my ($h12k, $h34k) = map("v$_", (16..17)); # H[1]^H[2] and H[3]^H[4] |
| |
| my ($rk0, $rk1, $rk2, $rk3, $rk4, $rk5, $rk6, $rk7, $rk8) = map("v$_", (18..26)); |
| my ($rk0q, $rk1q, $rk2q, $rk3q, $rk4q, $rk5q, $rk6q, $rk7q, $rk8q) = map("q$_", (18..26)); |
| |
| my $rk9_11_tmp = "v27"; # Used for rk9 (if AES-192) and rk11 (if AES-256) |
| my $rk10_12 = "v28"; # Used for rk10 (if AES-192) and rk12 (if AES-256) |
| my $rk9_11_tmpq = "q27"; |
| my $rk10_12q = "q28"; |
| |
| my $rkNm1 = "v31"; # Round N-1 key |
| my $rkNm1q = "q31"; |
| |
| my $mod_constant = "v8"; |
| my $mod_constantd = "d8"; |
| |
| my $final_block_dest = "v18"; |
| my $final_block_destq = "q18"; |
| |
| my ($ghash_t0, $ghash_t1, $ghash_t2, $ghash_t3) = map("v$_", (20..23)); |
| my ($ghash_t0d, $ghash_t1d, $ghash_t2d, $ghash_t3d) = map("d$_", (20..23)); |
| |
| my $v_rkN = "v30"; |
| my $v_rkNq = "q30"; |
| |
| # Locations on the stack |
| my $mod_constant_sp_offset = 128; |
| my $ctr0_sp_offset = 160; |
| my $ctr1_sp_offset = 176; |
| my $ctr2_sp_offset = 192; |
| my $ctr3_sp_offset = 208; |
| |
| # Registers specific to dec_kernel |
| my ($input_l1, $input_h1) = map("x$_", (19..20)); |
| my $ctr32w = "w9"; |
| my ($ctr32x, $ctr96_b64x) = map("x$_", (9..10)); |
| my $acc_md = "d10"; |
| |
| # Decryption temporary NEON registers |
| my $t0 = "v8"; my $t0d = "d8"; |
| my $t1 = "v4"; my $t1d = "d4"; |
| my $t2 = "v8"; my $t2d = "d8"; |
| my $t3 = "v4"; my $t3d = "d4"; |
| my $t4 = "v4"; my $t4d = "d4"; |
| my $t5 = "v5"; my $t5d = "d5"; |
| my $t6 = "v8"; my $t6d = "d8"; |
| my $t7 = "v5"; my $t7d = "d5"; |
| my $t8 = "v4"; my $t8d = "d8"; |
| my $t9 = "v6"; my $t9d = "d6"; |
| |
| my ($ctr_t0, $ctr_t1, $ctr_t2, $ctr_t3) = map("v$_", (4..7)); |
| my $mod_t = "v7"; |
| |
| # Decryption key registers |
| my $rk2q1 = "v20.1q"; |
| my $rk3q1 = "v21.1q"; |
| my $rk4v = "v22"; |
| my $rk4d = "d22"; |
| |
| my ($output_l1, $output_h1, $output_l2, $output_h2, $output_l3, $output_h3) = map("x$_", (19..24)); |
| my ($output_l0, $output_h0) = map("x$_", (6..7)); |
| |
| # --- Assemble the entire template --- |
| my $code_template = ""; |
| my $header_directives = <<'___'; |
| #if __ARM_MAX_ARCH__ >= 8 |
| .arch armv8.2-a+crypto+sha3 |
| .text |
| ___ |
| |
| $code_template .= <<"___"; |
| .global aes_gcm_enc_kernel |
| .type aes_gcm_enc_kernel,%function |
| .align 4 |
| aes_gcm_enc_kernel: |
| AARCH64_SIGN_LINK_REGISTER |
| stp x29, x30, [sp, #-224]! |
| mov x29, sp |
| ld1 { $ctr0.16b}, [x4] // Load initial counter block |
| stp x19, x20, [sp, #16] |
| mov $ctr1.16b, $ctr0.16b // Initialize ctr1-3 from ctr0 |
| mov $ctr2.16b, $ctr0.16b |
| mov $ctr3.16b, $ctr0.16b |
| mov $counter, x4 // Pointer to counter block in memory |
| mov $cc, x5 // Pointer to AES key schedule context |
| stp $mod_constantx, x22, [sp, #32] |
| // [sp, #48] is unused but allocated to align the stack layout with aes_gcm_dec_kernel |
| stp d8, d9, [sp, #64] // Save Neon registers |
| stp d10, d11, [sp, #80] |
| stp d12, d13, [sp, #96] |
| stp d14, d15, [sp, #112] |
| ldr $rounds_w, [$cc, #240] // Load number of AES rounds |
| add $key_ptr, $cc, $rounds, lsl #4 // Calculate pointer to the last round key |
| ldp $rkN_l, $rkN_h, [$key_ptr] // load round N key (for final XOR) |
| ldr $rkNm1q, [$key_ptr, #-16] // load round N-1 key |
| add $end_input_ptr, $input_ptr, $bit_length, lsr #3 // Calculate end of input |
| lsr $main_end_input_ptr, $bit_length, #3 // Total byte length |
| mov $len, $main_end_input_ptr |
| ldr $W_T32, [$counter, #12] // Load counter's low 32 bits |
| sub $main_end_input_ptr, $main_end_input_ptr, #1 // byte_len - 1 |
| ldr $rk0q, [$cc, #0] // load rk0 |
| and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 // Align main loop end to a multiple of 64 bytes |
| add $main_end_input_ptr, $main_end_input_ptr, $input_ptr |
| rev $W_T32, $W_T32 // Reverse for big-endian increment |
| uxtw $T32, $W_T32 // Zero extend reversed w12 into x10 for final counter update |
| // Pre-compute this value instead of using two instructions to reconstruct it every iteration |
| mov $mod_constantx, #0xc200000000000000 // GHASH reduction constant |
| str $mod_constantx, [sp, #$mod_constant_sp_offset] |
| // We maintain four copies of ctr values on the stack. Each loop iteration we |
| // store the updated ctr value to the last four bytes (e.g., $ctr0_sp_offset + 12). |
| // We then load the four values. This avoids a singificant number of |
| // expensive GPR->NEON and NEON->NEON moves. To avoid LDST forwarding we |
| // calculate and store the values one iteration ahead so they have time to |
| // drain before we load them. |
| str $ctr0q, [sp, #$ctr0_sp_offset] // Store base counter for block 0-3 |
| str $ctr0q, [sp, #$ctr1_sp_offset] |
| str $ctr0q, [sp, #$ctr2_sp_offset] |
| str $ctr0q, [sp, #$ctr3_sp_offset] |
| // Since we need the values right away don't go through the stack this first |
| // time. Manually insert the incremented big-endian counter values. |
| rev $tmp_gpr_w, $W_T32 |
| mov $ctr0.s[3], $tmp_gpr_w // ctr0 + 0 |
| add $tmp_gpr_w, $W_T32, #1 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| mov $ctr1.s[3], $tmp_gpr_w // ctr0 + 1 |
| add $tmp_gpr_w, $W_T32, #2 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| mov $ctr2.s[3], $tmp_gpr_w // ctr0 + 2 |
| add $tmp_gpr_w, $W_T32, #3 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| mov $ctr3.s[3], $tmp_gpr_w // ctr0 + 3 |
| // Calculate the ctr values for the *next* (not current) group of four |
| // blocks. Store the incremented parts to the stack. |
| add $tmp_gpr_w, $W_T32, #4 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| str $tmp_gpr_w, [sp, #@{[ $ctr0_sp_offset + 12 ]}] // ctr0 + 4 for next iter |
| add $tmp_gpr_w, $W_T32, #5 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| str $tmp_gpr_w, [sp, #@{[ $ctr1_sp_offset + 12 ]}] // ctr0 + 5 for next iter |
| add $tmp_gpr_w, $W_T32, #6 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| str $tmp_gpr_w, [sp, #@{[ $ctr2_sp_offset + 12 ]}] // ctr0 + 6 for next iter |
| add $tmp_gpr_w, $W_T32, #7 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| str $tmp_gpr_w, [sp, #@{[ $ctr3_sp_offset + 12 ]}] // ctr0 + 7 for next iter |
| add $W_T32, $W_T32, #8 // Advance counter past these two sets |
| // --- Start AES for first 4 blocks --- |
| aese $ctr0.16b, $rk0.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 0 |
| ldp $rk1q, $rk2q, [$cc, #16] // load rk1, rk2 |
| aese $ctr1.16b, $rk0.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 0 |
| ldp $rk3q, $rk4q, [$cc, #48] // load rk3, rk4 |
| aese $ctr2.16b, $rk0.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 0 |
| ldp $rk5q, $rk6q, [$cc, #80] // load rk5, rk6 |
| aese $ctr3.16b, $rk0.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 0 |
| ldp $h2q, $h3q, [$Htable, #32] // load H2, H3 (GHASH keys) |
| aese $ctr0.16b, $rk1.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 1 |
| ldp $rk7q, $rk8q, [$cc, #112] // load rk7, rk8 |
| aese $ctr1.16b, $rk1.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 1 |
| aese $ctr2.16b, $rk1.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 1 |
| aese $ctr3.16b, $rk1.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 1 |
| aese $ctr0.16b, $rk2.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 2 |
| ext $h3.16b, $h3.16b, $h3.16b, #8 // Byte swap H3 for GHASH |
| aese $ctr1.16b, $rk2.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 2 |
| ext $h2.16b, $h2.16b, $h2.16b, #8 // Byte swap H2 for GHASH |
| aese $ctr2.16b, $rk2.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 2 |
| ldr $h4q, [$Htable, #80] // load H4 |
| aese $ctr3.16b, $rk2.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 2 |
| ext $h4.16b, $h4.16b, $h4.16b, #8 // Byte swap H4 for GHASH |
| aese $ctr0.16b, $rk3.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 3 |
| ld1 { $acc_l.16b}, [$current_tag] // Load initial GHASH accumulator (T) |
| aese $ctr1.16b, $rk3.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 3 |
| ext $acc_l.16b, $acc_l.16b, $acc_l.16b, #8 // Byte swap T for GHASH |
| aese $ctr2.16b, $rk3.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 3 |
| rev64 $acc_l.16b, $acc_l.16b // Correct byte order within 64-bit lanes |
| aese $ctr3.16b, $rk3.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 3 |
| trn2 $h34k.2d, $h3.2d, $h4.2d // Karatsuba key: H4_low | H3_low |
| aese $ctr0.16b, $rk4.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 4 |
| ldr $h1q, [$Htable] // load H1 |
| aese $ctr1.16b, $rk4.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 4 |
| ext $h1.16b, $h1.16b, $h1.16b, #8 // Byte swap H1 for GHASH |
| aese $ctr2.16b, $rk4.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 4 |
| trn1 $acc_h.2d, $h3.2d, $h4.2d // Karatsuba key: H4_high | H3_high |
| aese $ctr3.16b, $rk4.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 4 |
| trn2 $h12k.2d, $h1.2d, $h2.2d // Karatsuba key: H2_low | H1_low |
| aese $ctr0.16b, $rk5.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 5 |
| ldr $v_rkNq, [$key_ptr] // Preload round N key for final EOR |
| aese $ctr1.16b, $rk5.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 5 |
| aese $ctr3.16b, $rk5.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 5 |
| aese $ctr2.16b, $rk5.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 5 |
| aese $ctr0.16b, $rk6.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 6 |
| aese $ctr1.16b, $rk6.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 6 |
| aese $ctr2.16b, $rk6.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 6 |
| aese $ctr3.16b, $rk6.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 6 |
| aese $ctr0.16b, $rk7.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 7 |
| aese $ctr1.16b, $rk7.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 7 |
| aese $ctr2.16b, $rk7.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 7 |
| aese $ctr3.16b, $rk7.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 7 |
| aese $ctr0.16b, $rk8.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 8 |
| aese $ctr1.16b, $rk8.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 8 |
| aese $ctr2.16b, $rk8.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 8 |
| aese $ctr3.16b, $rk8.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 8 |
| cmp $rounds, #12 // setup flags for AES-128/192/256 check |
| b.lt .Lenc_finish_first_blocks // branch if AES-128 |
| ldp $rk9_11_tmpq, $rk10_12q, [$cc, #144] // load rk9, rk10 |
| aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 9 |
| aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 9 |
| aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 9 |
| aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 9 |
| aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 10 |
| aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 10 |
| aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 10 |
| aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 10 |
| b.eq .Lenc_finish_first_blocks // branch if AES-192 |
| ldp $rk9_11_tmpq, $rk10_12q, [$cc, #176] // load rk11, rk12 |
| aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 11 |
| aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 11 |
| aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 11 |
| aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 11 |
| aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 12 |
| aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 12 |
| aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 12 |
| aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 12 |
| .Lenc_finish_first_blocks: |
| cmp $input_ptr, $main_end_input_ptr // check if we have <= 4 blocks to process in the tail |
| eor $h34k.16b, $h34k.16b, $acc_h.16b // Karatsuba key: H3^H4 |
| aese $ctr0.16b, $rkNm1.16b // AES block 0 - round N-1 |
| aese $ctr1.16b, $rkNm1.16b // AES block 1 - round N-1 |
| aese $ctr2.16b, $rkNm1.16b // AES block 2 - round N-1 |
| aese $ctr3.16b, $rkNm1.16b // AES block 3 - round N-1 |
| trn1 $mod_constant.2d, $h1.2d, $h2.2d // Karatsuba key: H2_high | H1_high |
| eor $h12k.16b, $h12k.16b, $mod_constant.16b // Karatsuba key: H1^H2 |
| b.ge .Lenc_tail // handle tail if no more full 4-block sets |
| ldp $res2q, $res3q, [$input_ptr, #32] // AES blocks 2,3 load plaintext |
| ldp $res0q, $res1q, [$input_ptr], #64 // AES blocks 0,1 load plaintext |
| // Compute and store first 4 ciphertext blocks |
| PLATFORM_EOR3($res0, $res0, $v_rkN, $ctr0) // AES block 0 - result = PT ^ AES(ctr0) |
| PLATFORM_EOR3($res1, $res1, $v_rkN, $ctr1) // AES block 1 - result = PT ^ AES(ctr1) |
| PLATFORM_EOR3($res2, $res2, $v_rkN, $ctr2) // AES block 2 - result = PT ^ AES(ctr2) |
| PLATFORM_EOR3($res3, $res3, $v_rkN, $ctr3) // AES block 3 - result = PT ^ AES(ctr3) |
| st1 { $res0.16b, $res1.16b, $res2.16b, $res3.16b}, [$output_ptr], #64 // AES blocks 0-3 - store result |
| // Load counter values for the second iteration from the stack |
| ldp $ctr0q, $ctr1q, [sp, #$ctr0_sp_offset] |
| ldp $ctr2q, $ctr3q, [sp, #$ctr2_sp_offset] |
| // Prepare and store counter values for the third iteration |
| rev $tmp_gpr_w, $W_T32 |
| str $tmp_gpr_w, [sp, #@{[ $ctr0_sp_offset + 12 ]}] // ctr + 8 |
| add $tmp_gpr_w, $W_T32, #1 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| str $tmp_gpr_w, [sp, #@{[ $ctr1_sp_offset + 12 ]}] // ctr + 9 |
| add $tmp_gpr_w, $W_T32, #2 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| str $tmp_gpr_w, [sp, #@{[ $ctr2_sp_offset + 12 ]}] // ctr + 10 |
| add $tmp_gpr_w, $W_T32, #3 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| str $tmp_gpr_w, [sp, #@{[ $ctr3_sp_offset + 12 ]}] // ctr + 11 |
| add $W_T32, $W_T32, #4 // Advance counter base |
| cmp $input_ptr, $main_end_input_ptr // check if we have <= 4 blocks remaining |
| b.ge .Lenc_prepretail // go to prepretail if < 2 full loops left |
| .Lenc_main_loop: // main loop start (processes 4 blocks per iteration) |
| // --- AES Pipeline for blocks 4k+4 to 4k+7 --- |
| aese $ctr0.16b, $rk0.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 0 |
| aese $ctr1.16b, $rk0.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 0 |
| aese $ctr2.16b, $rk0.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 0 |
| aese $ctr3.16b, $rk0.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 0 |
| ldr $mod_constantd, [sp, #$mod_constant_sp_offset] // Load GHASH reduction constant |
| aese $ctr0.16b, $rk1.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 1 |
| aese $ctr1.16b, $rk1.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 1 |
| aese $ctr2.16b, $rk1.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 1 |
| aese $ctr3.16b, $rk1.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 1 |
| // --- GHASH Pipeline (interleaved with AES) for blocks 4k to 4k+3 --- |
| rev64 $res0.16b, $res0.16b // GHASH block 4k - Byte swap CT |
| rev64 $res1.16b, $res1.16b // GHASH block 4k+1 - Byte swap CT |
| rev64 $res2.16b, $res2.16b // GHASH block 4k+2 - Byte swap CT |
| rev64 $res3.16b, $res3.16b // GHASH block 4k+3 - Byte swap CT |
| aese $ctr0.16b, $rk2.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 2 |
| ext $acc_l.16b, $acc_l.16b, $acc_l.16b, #8 // GHASH - prepare acc for XOR |
| aese $ctr1.16b, $rk2.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 2 |
| eor $res0.16b, $res0.16b, $acc_l.16b // GHASH block 4k - Y_i = CT_i ^ Y_{i-1} |
| aese $ctr2.16b, $rk2.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 2 |
| pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low |
| aese $ctr3.16b, $rk2.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 2 |
| pmull2 $acc_h.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high |
| aese $ctr0.16b, $rk3.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 3 |
| mov $acc_md, $h34k.d[1] // GHASH block 4k - mid Karatsuba key |
| aese $ctr1.16b, $rk3.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 3 |
| mov $ghash_t0d, $res0.d[1] // GHASH block 4k - mid |
| aese $ctr2.16b, $rk3.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 3 |
| eor $ghash_t0.8b, $ghash_t0.8b, $res0.8b // GHASH block 4k - mid |
| aese $ctr3.16b, $rk3.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 3 |
| mov $ghash_t1d, $res1.d[1] // GHASH block 4k+1 - mid |
| aese $ctr0.16b, $rk4.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 4 |
| eor $ghash_t1.8b, $ghash_t1.8b, $res1.8b // GHASH block 4k+1 - mid |
| aese $ctr1.16b, $rk4.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 4 |
| pmull $acc_m.1q, $ghash_t0.1d, $acc_m.1d // GHASH block 4k - mid |
| aese $ctr2.16b, $rk4.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 4 |
| pmull $ghash_t1.1q, $ghash_t1.1d, $h34k.1d // GHASH block 4k+1 - mid |
| aese $ctr3.16b, $rk4.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 4 |
| eor $acc_m.16b, $acc_m.16b, $ghash_t1.16b // GHASH block 4k+1 - mid |
| aese $ctr0.16b, $rk5.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 5 |
| ext $ghash_t2.16b, $ghash_t2.16b, $res2.16b, #8 // GHASH block 4k+2 - mid |
| aese $ctr1.16b, $rk5.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 5 |
| eor $ghash_t2.16b, $ghash_t2.16b, $res2.16b // GHASH block 4k+2 - mid |
| aese $ctr2.16b, $rk5.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 5 |
| pmull2 $ghash_t2.1q, $ghash_t2.2d, $h12k.2d // GHASH block 4k+2 - mid |
| aese $ctr3.16b, $rk5.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 5 |
| mov $ghash_t3d, $res3.d[1] // GHASH block 4k+3 - mid |
| aese $ctr0.16b, $rk6.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 6 |
| eor $ghash_t3.8b, $ghash_t3.8b, $res3.8b // GHASH block 4k+3 - mid |
| aese $ctr1.16b, $rk6.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 6 |
| pmull $ghash_t3.1q, $ghash_t3.1d, $h12k.1d // GHASH block 4k+3 - mid |
| aese $ctr2.16b, $rk6.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 6 |
| PLATFORM_EOR3($acc_m, $acc_m, $ghash_t2, $ghash_t3) // GHASH block 4k+2/3 - mid |
| aese $ctr3.16b, $rk6.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 6 |
| pmull2 $ghash_t2.1q, $res0.2d, $h4.2d // GHASH block 4k - high |
| aese $ctr0.16b, $rk7.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 7 |
| pmull2 $ghash_t1.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high |
| eor $ghash_t2.16b, $ghash_t2.16b, $ghash_t1.16b // GHASH block 4k+3 - high |
| aese $ctr1.16b, $rk7.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 7 |
| pmull2 $ghash_t3.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high |
| aese $ctr2.16b, $rk7.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 7 |
| pmull $ghash_t1.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low |
| aese $ctr3.16b, $rk7.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 7 |
| pmull $ghash_t0.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low |
| PLATFORM_EOR3($acc_h, $acc_h, $ghash_t2, $ghash_t3) // GHASH block 4k/1/2/3 - high |
| pmull $ghash_t2.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low |
| ldp $res2q, $res3q, [$input_ptr, #32] |
| ldp $res0q, $res1q, [$input_ptr], #64 |
| eor $ghash_t0.16b, $ghash_t0.16b, $ghash_t1.16b // GHASH block 4k+1 - low |
| aese $ctr0.16b, $rk8.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 8 |
| PLATFORM_EOR3($acc_l, $acc_l, $ghash_t2, $ghash_t0) // GHASH block 4k/1/2/3 - low |
| aese $ctr1.16b, $rk8.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 8 |
| PLATFORM_EOR3($acc_m, $acc_m, $acc_h, $acc_l) // MODULO - karatsuba tidy up |
| aese $ctr2.16b, $rk8.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 8 |
| pmull $ghash_t0.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid |
| aese $ctr3.16b, $rk8.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 8 |
| cmp $rounds, #12 // setup flags for AES-128/192/256 check |
| b.lt .Lenc_main_loop_continue // branch if AES-128 |
| ldp $rk9_11_tmpq, $rk10_12q, [$cc, #144] // load rk9, rk10 |
| aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 9 |
| aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 9 |
| aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 9 |
| aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 9 |
| aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 10 |
| aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 10 |
| aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 10 |
| aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 10 |
| b.eq .Lenc_main_loop_continue // branch if AES-192 |
| ldp $rk9_11_tmpq, $rk10_12q, [$cc, #176] // load rk11, rk12 |
| aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 11 |
| aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 11 |
| aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 11 |
| aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 11 |
| aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 12 |
| aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 12 |
| aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 12 |
| aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 12 |
| .Lenc_main_loop_continue: |
| ext $acc_h.16b, $acc_h.16b, $acc_h.16b, #8 // MODULO - other top alignment |
| PLATFORM_EOR3($acc_m, $acc_m, $ghash_t0, $acc_h) // MODULO - fold into mid |
| pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low |
| ext $ghash_t0.16b, $acc_m.16b, $acc_m.16b, #8 // MODULO - other mid alignment |
| PLATFORM_EOR3($acc_l, $acc_h, $acc_l, $ghash_t0) // MODULO - fold into low |
| aese $ctr0.16b, $rkNm1.16b // AES block 4k+4 - round N-1 |
| PLATFORM_EOR3($res0, $res0, $v_rkN, $ctr0) // AES block 4k+4 - result |
| aese $ctr1.16b, $rkNm1.16b // AES block 4k+5 - round N-1 |
| PLATFORM_EOR3($res1, $res1, $v_rkN, $ctr1) // AES block 4k+5 - result |
| aese $ctr2.16b, $rkNm1.16b // AES block 4k+6 - round N-1 |
| PLATFORM_EOR3($res2, $res2, $v_rkN, $ctr2) // AES block 4k+6 - result |
| aese $ctr3.16b, $rkNm1.16b // AES block 4k+7 - round N-1 |
| PLATFORM_EOR3($res3, $res3, $v_rkN, $ctr3) // AES block 4k+7 - result |
| ldp $ctr0q, $ctr1q, [sp, #$ctr0_sp_offset] |
| ldp $ctr2q, $ctr3q, [sp, #$ctr2_sp_offset] |
| // We used these registers as temporaries above so reload the RKs. |
| ldp $rk2q, $rk3q, [$cc, #32] // load rk2, rk3 |
| ldp $rk4q, $rk5q, [$cc, #64] // load rk4, rk5 |
| st1 { $res0.16b, $res1.16b, $res2.16b, $res3.16b}, [$output_ptr], #64 // AES blocks 4k+4-7 - store result |
| rev $tmp_gpr_w, $W_T32 |
| str $tmp_gpr_w, [sp, #@{[ $ctr0_sp_offset + 12 ]}] |
| add $tmp_gpr_w, $W_T32, #1 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| str $tmp_gpr_w, [sp, #@{[ $ctr1_sp_offset + 12 ]}] |
| add $tmp_gpr_w, $W_T32, #2 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| str $tmp_gpr_w, [sp, #@{[ $ctr2_sp_offset + 12 ]}] |
| add $tmp_gpr_w, $W_T32, #3 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| str $tmp_gpr_w, [sp, #@{[ $ctr3_sp_offset + 12 ]}] |
| add $W_T32, $W_T32, #4 |
| cmp $input_ptr, $main_end_input_ptr // .LOOP CONTROL |
| b.lt .Lenc_main_loop |
| .Lenc_prepretail: // PREPRETAIL |
| aese $ctr1.16b, $rk0.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 0 |
| rev64 $res2.16b, $res2.16b // GHASH block 2 |
| aese $ctr2.16b, $rk0.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 0 |
| aese $ctr0.16b, $rk0.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 0 |
| rev64 $res0.16b, $res0.16b // GHASH block 0 |
| ext $acc_l.16b, $acc_l.16b, $acc_l.16b, #8 // PRE 0 |
| aese $ctr2.16b, $rk1.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 1 |
| aese $ctr0.16b, $rk1.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 1 |
| eor $res0.16b, $res0.16b, $acc_l.16b // PRE 1 |
| rev64 $res1.16b, $res1.16b // GHASH block 1 |
| aese $ctr2.16b, $rk2.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 2 |
| aese $ctr3.16b, $rk0.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 0 |
| mov $acc_md, $h34k.d[1] // GHASH block 0 - mid Karatsuba key |
| aese $ctr1.16b, $rk1.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 1 |
| pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 0 - low |
| mov $mod_constantd, $res0.d[1] // GHASH block 0 - mid |
| pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 0 - high |
| aese $ctr2.16b, $rk3.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 3 |
| aese $ctr1.16b, $rk2.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 2 |
| eor $mod_constant.8b, $mod_constant.8b, $res0.8b // GHASH block 0 - mid |
| aese $ctr0.16b, $rk2.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 2 |
| aese $ctr3.16b, $rk1.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 1 |
| aese $ctr1.16b, $rk3.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 3 |
| pmull $acc_m.1q, $mod_constant.1d, $acc_m.1d // GHASH block 0 - mid |
| pmull2 $res0.1q, $res1.2d, $h3.2d // GHASH block 1 - high |
| pmull $mod_constant.1q, $res1.1d, $h3.1d // GHASH block 1 - low |
| aese $ctr3.16b, $rk2.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 2 |
| eor $acc_h.16b, $acc_h.16b, $res0.16b // GHASH block 1 - high |
| mov $res0d, $res1.d[1] // GHASH block 1 - mid |
| aese $ctr0.16b, $rk3.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 3 |
| eor $acc_l.16b, $acc_l.16b, $mod_constant.16b // GHASH block 1 - low |
| aese $ctr3.16b, $rk3.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 3 |
| eor $res0.8b, $res0.8b, $res1.8b // GHASH block 1 - mid |
| mov $mod_constantd, $res2.d[1] // GHASH block 2 - mid |
| aese $ctr0.16b, $rk4.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 4 |
| rev64 $res3.16b, $res3.16b // GHASH block 3 |
| aese $ctr3.16b, $rk4.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 4 |
| pmull $res0.1q, $res0.1d, $h34k.1d // GHASH block 1 - mid |
| eor $mod_constant.8b, $mod_constant.8b, $res2.8b // GHASH block 2 - mid |
| pmull $res1.1q, $res2.1d, $h2.1d // GHASH block 2 - low |
| aese $ctr3.16b, $rk5.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 5 |
| aese $ctr2.16b, $rk4.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 4 |
| eor $acc_m.16b, $acc_m.16b, $res0.16b // GHASH block 1 - mid |
| pmull2 $res0.1q, $res2.2d, $h2.2d // GHASH block 2 - high |
| eor $acc_l.16b, $acc_l.16b, $res1.16b // GHASH block 2 - low |
| ins $mod_constant.d[1], $mod_constant.d[0] // GHASH block 2 - mid |
| aese $ctr2.16b, $rk5.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 5 |
| eor $acc_h.16b, $acc_h.16b, $res0.16b // GHASH block 2 - high |
| mov $res0d, $res3.d[1] // GHASH block 3 - mid |
| aese $ctr1.16b, $rk4.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 4 |
| pmull2 $mod_constant.1q, $mod_constant.2d, $h12k.2d // GHASH block 2 - mid |
| eor $res0.8b, $res0.8b, $res3.8b // GHASH block 3 - mid |
| pmull2 $res1.1q, $res3.2d, $h1.2d // GHASH block 3 - high |
| aese $ctr1.16b, $rk5.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 5 |
| pmull $res0.1q, $res0.1d, $h12k.1d // GHASH block 3 - mid |
| eor $acc_m.16b, $acc_m.16b, $mod_constant.16b // GHASH block 2 - mid |
| aese $ctr0.16b, $rk5.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 5 |
| aese $ctr1.16b, $rk6.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 6 |
| aese $ctr2.16b, $rk6.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 6 |
| aese $ctr0.16b, $rk6.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 6 |
| aese $ctr3.16b, $rk6.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 6 |
| aese $ctr1.16b, $rk7.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 7 |
| eor $acc_h.16b, $acc_h.16b, $res1.16b // GHASH block 3 - high |
| aese $ctr0.16b, $rk7.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 7 |
| aese $ctr3.16b, $rk7.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 7 |
| ldr $mod_constantd, [sp, #$mod_constant_sp_offset] |
| aese $ctr1.16b, $rk8.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 8 |
| eor $acc_m.16b, $acc_m.16b, $res0.16b // GHASH block 3 - mid |
| pmull $res2.1q, $res3.1d, $h1.1d // GHASH block 3 - low |
| aese $ctr3.16b, $rk8.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 8 |
| cmp $rounds, #12 // setup flags for AES-128/192/256 check |
| aese $ctr0.16b, $rk8.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 8 |
| eor $acc_l.16b, $acc_l.16b, $res2.16b // GHASH block 3 - low |
| aese $ctr2.16b, $rk7.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 7 |
| eor $acc_m.16b, $acc_m.16b, $acc_h.16b // karatsuba tidy up |
| aese $ctr2.16b, $rk8.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 8 |
| pmull $res0.1q, $acc_h.1d, $mod_constant.1d |
| ext $acc_h.16b, $acc_h.16b, $acc_h.16b, #8 |
| eor $acc_m.16b, $acc_m.16b, $acc_l.16b |
| b.lt .Lenc_finish_prepretail // branch if AES-128 |
| ldp $rk9_11_tmpq, $rk10_12q, [$cc, #144] // load rk9, rk10 |
| aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 9 |
| aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 9 |
| aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 9 |
| aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 9 |
| aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 10 |
| aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 10 |
| aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 10 |
| aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 10 |
| b.eq .Lenc_finish_prepretail // branch if AES-192 |
| ldp $rk9_11_tmpq, $rk10_12q, [$cc, #176] // load rk11, rk12 |
| aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 11 |
| aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 11 |
| aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 11 |
| aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 11 |
| aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 12 |
| aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 12 |
| aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 12 |
| aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 12 |
| .Lenc_finish_prepretail: |
| aese $ctr0.16b, $rkNm1.16b // AES block 0 - round N-1 |
| aese $ctr1.16b, $rkNm1.16b // AES block 1 - round N-1 |
| aese $ctr2.16b, $rkNm1.16b // AES block 2 - round N-1 |
| aese $ctr3.16b, $rkNm1.16b // AES block 3 - round N-1 |
| PLATFORM_EOR3($acc_m, $acc_m, $res0, $acc_h) |
| pmull $res0.1q, $acc_m.1d, $mod_constant.1d |
| ext $acc_m.16b, $acc_m.16b, $acc_m.16b, #8 |
| PLATFORM_EOR3($acc_l, $acc_l, $res0, $acc_m) |
| .Lenc_tail: // TAIL: Process remaining 0 to 3 blocks |
| ext $mod_constant.16b, $acc_l.16b, $acc_l.16b, #8 // Save current GHASH state for partial tag feed-in |
| sub $main_end_input_ptr, $end_input_ptr, $input_ptr // main_end_input_ptr is number of bytes left to process |
| ldp $input_l0, $input_h0, [$input_ptr], #16 // AES block 0 - load plaintext |
| eor $input_l0, $input_l0, $rkN_l // AES block 0 - round N low |
| eor $input_h0, $input_h0, $rkN_h // AES block 0 - round N high |
| cmp $main_end_input_ptr, #48 |
| fmov $res0d, $input_l0 // AES block 0 - mov low |
| fmov $res0.d[1], $input_h0 // AES block 0 - mov high |
| eor $res1.16b, $res0.16b, $ctr0.16b // AES block 0 - result |
| b.gt .Lenc_blocks_more_than_3 |
| cmp $main_end_input_ptr, #32 |
| mov $ctr3.16b, $ctr2.16b |
| movi $acc_l.8b, #0 |
| movi $acc_h.8b, #0 |
| mov $ctr2.16b, $ctr1.16b |
| movi $acc_m.8b, #0 |
| b.gt .Lenc_blocks_more_than_2 |
| mov $ctr3.16b, $ctr1.16b |
| cmp $main_end_input_ptr, #16 |
| b.gt .Lenc_blocks_more_than_1 |
| b .Lenc_blocks_less_than_1 |
| .Lenc_blocks_more_than_3: // blocks left > 3 |
| st1 { $res1.16b}, [$output_ptr], #16 // AES final-2 block - store result |
| ldp $input_l0, $input_h0, [$input_ptr], #16 // AES final-2 block - load input low & high |
| rev64 $res0.16b, $res1.16b // GHASH final-3 block |
| eor $input_l0, $input_l0, $rkN_l // AES final-2 block - round N low |
| eor $res0.16b, $res0.16b, $mod_constant.16b // feed in partial tag |
| eor $input_h0, $input_h0, $rkN_h // AES final-2 block - round N high |
| mov $ghash_t2d, $res0.d[1] // GHASH final-3 block - mid |
| fmov $res1d, $input_l0 // AES final-2 block - mov low |
| fmov $res1.d[1], $input_h0 // AES final-2 block - mov high |
| eor $ghash_t2.8b, $ghash_t2.8b, $res0.8b // GHASH final-3 block - mid |
| movi $mod_constant.8b, #0 // suppress further partial tag feed in |
| mov $acc_md, $h34k.d[1] // GHASH final-3 block - mid |
| pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH final-3 block - low |
| pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH final-3 block - high |
| pmull $acc_m.1q, $ghash_t2.1d, $acc_m.1d // GHASH final-3 block - mid |
| eor $res1.16b, $res1.16b, $ctr1.16b // AES final-2 block - result |
| .Lenc_blocks_more_than_2: // blocks left > 2 |
| st1 { $res1.16b}, [$output_ptr], #16 // AES final-2 block - store result |
| ldp $input_l0, $input_h0, [$input_ptr], #16 // AES final-1 block - load input low & high |
| rev64 $res0.16b, $res1.16b // GHASH final-2 block |
| eor $input_l0, $input_l0, $rkN_l // AES final-1 block - round N low |
| eor $res0.16b, $res0.16b, $mod_constant.16b // feed in partial tag |
| fmov $res1d, $input_l0 // AES final-1 block - mov low |
| eor $input_h0, $input_h0, $rkN_h // AES final-1 block - round N high |
| fmov $res1.d[1], $input_h0 // AES final-1 block - mov high |
| movi $mod_constant.8b, #0 // suppress further partial tag feed in |
| pmull2 $ghash_t0.1q, $res0.2d, $h3.2d // GHASH final-2 block - high |
| mov $ghash_t2d, $res0.d[1] // GHASH final-2 block - mid |
| pmull $ghash_t1.1q, $res0.1d, $h3.1d // GHASH final-2 block - low |
| eor $ghash_t2.8b, $ghash_t2.8b, $res0.8b // GHASH final-2 block - mid |
| eor $res1.16b, $res1.16b, $ctr2.16b // AES final-1 block - result |
| eor $acc_h.16b, $acc_h.16b, $ghash_t0.16b // GHASH final-2 block - high |
| pmull $ghash_t2.1q, $ghash_t2.1d, $h34k.1d // GHASH final-2 block - mid |
| eor $acc_l.16b, $acc_l.16b, $ghash_t1.16b // GHASH final-2 block - low |
| eor $acc_m.16b, $acc_m.16b, $ghash_t2.16b // GHASH final-2 block - mid |
| .Lenc_blocks_more_than_1: // blocks left > 1 |
| st1 { $res1.16b}, [$output_ptr], #16 // AES final-1 block - store result |
| rev64 $res0.16b, $res1.16b // GHASH final-1 block: Byte Swap CT |
| ldp $input_l0, $input_h0, [$input_ptr], #16 // AES final block - load plaintext |
| eor $res0.16b, $res0.16b, $mod_constant.16b // Feed in partial tag |
| movi $mod_constant.8b, #0 // Clear for next block |
| eor $input_l0, $input_l0, $rkN_l // AES final block - round N low |
| mov $ghash_t2d, $res0.d[1] // GHASH final-1 block - mid |
| pmull2 $ghash_t0.1q, $res0.2d, $h2.2d // GHASH final-1 block - high |
| eor $input_h0, $input_h0, $rkN_h // AES final block - round N high |
| eor $ghash_t2.8b, $ghash_t2.8b, $res0.8b // GHASH final-1 block - mid |
| eor $acc_h.16b, $acc_h.16b, $ghash_t0.16b // GHASH final-1 block - high |
| ins $ghash_t2.d[1], $ghash_t2.d[0] // GHASH final-1 block - mid |
| fmov $res1d, $input_l0 // AES final block - mov low |
| fmov $res1.d[1], $input_h0 // AES final block - mov high |
| pmull2 $ghash_t2.1q, $ghash_t2.2d, $h12k.2d // GHASH final-1 block - mid |
| pmull $ghash_t1.1q, $res0.1d, $h2.1d // GHASH final-1 block - low |
| eor $res1.16b, $res1.16b, $ctr3.16b // AES final block - result |
| eor $acc_m.16b, $acc_m.16b, $ghash_t2.16b // GHASH final-1 block - mid |
| eor $acc_l.16b, $acc_l.16b, $ghash_t1.16b // GHASH final-1 block - low |
| .Lenc_blocks_less_than_1: // Last partial block handling |
| add $T32, $T32, $bit_length, lsr #7 // Calculate the updated counter based on the number of 16B chunks we processed |
| rev $W32, $W32 |
| str $W32, [$counter, #12] // store the updated counter |
| and $bit_length, $bit_length, #127 // bit_length %= 128 |
| mvn $rkN_l, xzr // Mask for low 64 bits |
| sub $bit_length, $bit_length, #128 // |
| neg $bit_length, $bit_length // Valid bits in the last block (1-128) |
| ldr $final_block_destq, [$output_ptr] // Load destination for merging |
| mvn $rkN_h, xzr // Mask for high 64 bits |
| and $bit_length, $bit_length, #127 // bit_length %= 128 |
| lsr $rkN_h, $rkN_h, $bit_length // rkN_h is mask for top 64b of last block |
| cmp $bit_length, #64 |
| csel $input_l0, $rkN_l, $rkN_h, lt |
| csel $input_h0, $rkN_h, xzr, lt |
| fmov $ctr0d, $input_l0 // ctr0d is mask for last block |
| fmov $ctr0.d[1], $input_h0 |
| and $res1.16b, $res1.16b, $ctr0.16b // Mask out unused bits of the last CT block |
| rev64 $res0.16b, $res1.16b // GHASH final block - byte swap |
| eor $res0.16b, $res0.16b, $mod_constant.16b // Feed in partial tag |
| bif $res1.16b, $final_block_dest.16b, $ctr0.16b // Bitwise Insert: merge with existing data at output_ptr |
| pmull2 $ghash_t0.1q, $res0.2d, $h1.2d // GHASH final block - high |
| mov $mod_constantd, $res0.d[1] // GHASH final block - mid |
| pmull $ghash_t1.1q, $res0.1d, $h1.1d // GHASH final block - low |
| eor $acc_h.16b, $acc_h.16b, $ghash_t0.16b // GHASH final block - high |
| eor $mod_constant.8b, $mod_constant.8b, $res0.8b // GHASH final block - mid |
| pmull $mod_constant.1q, $mod_constant.1d, $h12k.1d // GHASH final block - mid |
| eor $acc_l.16b, $acc_l.16b, $ghash_t1.16b // GHASH final block - low |
| eor $acc_m.16b, $acc_m.16b, $mod_constant.16b // GHASH final block - mid |
| eor $res0.16b, $acc_l.16b, $acc_h.16b // MODULO - karatsuba tidy up |
| fmov $mod_constantd, $mod_constantx |
| eor $acc_m.16b, $acc_m.16b, $res0.16b // MODULO - karatsuba tidy up |
| pmull $res3.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid |
| ext $acc_h.16b, $acc_h.16b, $acc_h.16b, #8 // MODULO - other top alignment |
| PLATFORM_EOR3($acc_m, $acc_m, $res3, $acc_h) // MODULO - fold into mid |
| pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low |
| ext $acc_m.16b, $acc_m.16b, $acc_m.16b, #8 // MODULO - other mid alignment |
| st1 { $res1.16b}, [$output_ptr] // store all 16B |
| PLATFORM_EOR3($acc_l, $acc_l, $acc_h, $acc_m) // MODULO - fold into low |
| ext $acc_l.16b, $acc_l.16b, $acc_l.16b, #8 // Byte swap GHASH result |
| rev64 $acc_l.16b, $acc_l.16b // Final Tag |
| mov $input_ptr, $len |
| st1 { $acc_l.16b }, [$current_tag] // Store final tag |
| ldp x19, x20, [sp, #16] |
| ldp $mod_constantx, x22, [sp, #32] |
| ldp d8, d9, [sp, #64] |
| ldp d10, d11, [sp, #80] |
| ldp d12, d13, [sp, #96] |
| ldp d14, d15, [sp, #112] |
| ldp x29, x30, [sp], #224 |
| AARCH64_VALIDATE_LINK_REGISTER |
| ret |
| .size aes_gcm_enc_kernel,.-aes_gcm_enc_kernel |
| ___ |
| ################################################################################ |
| # aes_gcm_dec_kernel |
| ################################################################################ |
| $code_template .= <<"___"; |
| .global aes_gcm_dec_kernel |
| .type aes_gcm_dec_kernel,%function |
| .align 4 |
| aes_gcm_dec_kernel: |
| AARCH64_SIGN_LINK_REGISTER |
| stp x29, x30, [sp, #-224]! |
| mov x29, sp |
| stp x19, x20, [sp, #16] |
| ld1 { $ctr0.16b}, [x4] |
| mov $ctr1.16b, $ctr0.16b |
| mov $ctr2.16b, $ctr0.16b |
| mov $ctr3.16b, $ctr0.16b |
| mov $counter, x4 |
| mov $cc, x5 |
| stp $mod_constantx, x22, [sp, #32] |
| stp x23, x24, [sp, #48] |
| stp d8, d9, [sp, #64] |
| stp d10, d11, [sp, #80] |
| stp d12, d13, [sp, #96] |
| stp d14, d15, [sp, #112] |
| ldr $rounds_w, [$cc, #240] // Load number of AES rounds |
| add $input_l1, $cc, $rounds, lsl #4 // borrow input_l1 for last key |
| ldp $rkN_l, $rkN_h, [$input_l1] // load round N keys |
| ldr $rkNm1q, [$input_l1, #-16] // load round N-1 keys |
| add $end_input_ptr, $input_ptr, $bit_length, lsr #3 // end_input_ptr |
| lsr $main_end_input_ptr, $bit_length, #3 // byte_len |
| mov $len, $main_end_input_ptr |
| ldr $ctr32w, [$counter, #12] // Load scalar 32-bit counter (CTR) |
| sub $main_end_input_ptr, $main_end_input_ptr, #1 // byte_len - 1 |
| ldr $rk0q, [$cc, #0] // load rk0 |
| and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) |
| add $main_end_input_ptr, $main_end_input_ptr, $input_ptr |
| rev $ctr32w, $ctr32w // Reverse it once for big-endian incrementing |
| uxtw $T32, $ctr32w // Zero extend reversed w9 into x10 |
| str $ctr0q, [sp, #$ctr0_sp_offset] |
| str $ctr0q, [sp, #$ctr1_sp_offset] |
| str $ctr0q, [sp, #$ctr2_sp_offset] |
| str $ctr0q, [sp, #$ctr3_sp_offset] |
| rev $tmp_gpr_w, $ctr32w |
| mov $ctr0.s[3], $tmp_gpr_w |
| add $tmp_gpr_w, $ctr32w, #1 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| mov $ctr1.s[3], $tmp_gpr_w |
| add $tmp_gpr_w, $ctr32w, #2 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| mov $ctr2.s[3], $tmp_gpr_w |
| add $tmp_gpr_w, $ctr32w, #3 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| mov $ctr3.s[3], $tmp_gpr_w |
| add $tmp_gpr_w, $ctr32w, #4 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| str $tmp_gpr_w, [sp, #@{[ $ctr0_sp_offset + 12 ]}] |
| add $tmp_gpr_w, $ctr32w, #5 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| str $tmp_gpr_w, [sp, #@{[ $ctr1_sp_offset + 12 ]}] |
| add $tmp_gpr_w, $ctr32w, #6 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| str $tmp_gpr_w, [sp, #@{[ $ctr2_sp_offset + 12 ]}] |
| add $tmp_gpr_w, $ctr32w, #7 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| str $tmp_gpr_w, [sp, #@{[ $ctr3_sp_offset + 12 ]}] |
| add $ctr32w, $ctr32w, #8 |
| // Pre-compute this value instead of using two instructions for moving and then shifting in the main loop |
| mov $mod_constantx, #0xc200000000000000 |
| str $mod_constantx, [sp, #$mod_constant_sp_offset] |
| aese $ctr0.16b, $rk0.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 0 |
| ldp $rk1q, $rk2q, [$cc, #16] // load rk1, rk2 |
| aese $ctr1.16b, $rk0.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 0 |
| ldp $rk3q, $rk4q, [$cc, #48] // load rk3, rk4 |
| aese $ctr2.16b, $rk0.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 0 |
| ldp $rk5q, $rk6q, [$cc, #80] // load rk5, rk6 |
| aese $ctr3.16b, $rk0.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 0 |
| ldp $h2q, $h3q, [$Htable, #32] // load h2, h3 |
| aese $ctr0.16b, $rk1.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 1 |
| ldp $rk7q, $rk8q, [$cc, #112] // load rk7, rk8 |
| aese $ctr1.16b, $rk1.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 1 |
| aese $ctr2.16b, $rk1.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 1 |
| aese $ctr3.16b, $rk1.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 1 |
| aese $ctr0.16b, $rk2.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 2 |
| ext $h3.16b, $h3.16b, $h3.16b, #8 |
| aese $ctr1.16b, $rk2.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 2 |
| ext $h2.16b, $h2.16b, $h2.16b, #8 |
| ldr $h4q, [$Htable, #80] // load h4 |
| aese $ctr2.16b, $rk2.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 2 |
| aese $ctr3.16b, $rk2.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 2 |
| ext $h4.16b, $h4.16b, $h4.16b, #8 |
| aese $ctr0.16b, $rk3.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 3 |
| ld1 { $acc_l.16b}, [$current_tag] |
| aese $ctr1.16b, $rk3.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 3 |
| ext $acc_l.16b, $acc_l.16b, $acc_l.16b, #8 |
| aese $ctr2.16b, $rk3.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 3 |
| rev64 $acc_l.16b, $acc_l.16b |
| aese $ctr3.16b, $rk3.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 3 |
| trn2 $h34k.2d, $h3.2d, $h4.2d // h4l | h3l |
| aese $ctr0.16b, $rk4.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 4 |
| ldr $h1q, [$Htable] // load h1 |
| aese $ctr1.16b, $rk4.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 4 |
| ext $h1.16b, $h1.16b, $h1.16b, #8 |
| aese $ctr2.16b, $rk4.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 4 |
| trn1 $acc_h.2d, $h3.2d, $h4.2d // h4h | h3h |
| aese $ctr3.16b, $rk4.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 4 |
| trn2 $h12k.2d, $h1.2d, $h2.2d // h2l | h1l |
| aese $ctr0.16b, $rk5.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 5 |
| aese $ctr1.16b, $rk5.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 5 |
| aese $ctr3.16b, $rk5.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 5 |
| aese $ctr2.16b, $rk5.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 5 |
| aese $ctr0.16b, $rk6.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 6 |
| aese $ctr1.16b, $rk6.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 6 |
| aese $ctr2.16b, $rk6.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 6 |
| aese $ctr3.16b, $rk6.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 6 |
| aese $ctr0.16b, $rk7.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 7 |
| aese $ctr1.16b, $rk7.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 7 |
| aese $ctr2.16b, $rk7.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 7 |
| aese $ctr3.16b, $rk7.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 7 |
| aese $ctr0.16b, $rk8.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 8 |
| aese $ctr1.16b, $rk8.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 8 |
| aese $ctr2.16b, $rk8.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 8 |
| aese $ctr3.16b, $rk8.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 8 |
| cmp $rounds, #12 // setup flags for AES-128/192/256 check |
| b.lt .Ldec_finish_first_blocks // branch if AES-128 |
| ldp $rk9_11_tmpq, $rk10_12q, [$cc, #144] // load rk9, rk10 |
| aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 9 |
| aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 9 |
| aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 9 |
| aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 9 |
| aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 10 |
| aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 10 |
| aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 10 |
| aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 10 |
| b.eq .Ldec_finish_first_blocks // branch if AES-192 |
| ldp $rk9_11_tmpq, $rk10_12q, [$cc, #176] // load rk11, rk12 |
| aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 11 |
| aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 11 |
| aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 11 |
| aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 11 |
| aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 12 |
| aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 12 |
| aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 12 |
| aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 12 |
| .Ldec_finish_first_blocks: |
| ldr $rk9_11_tmpq, [$input_l1] // load rkN |
| cmp $input_ptr, $main_end_input_ptr // check if we have <= 4 blocks |
| eor $h34k.16b, $h34k.16b, $acc_h.16b // h4k | h3k |
| aese $ctr1.16b, $rkNm1.16b // AES block 1 - round N-1 |
| aese $ctr2.16b, $rkNm1.16b // AES block 2 - round N-1 |
| aese $ctr3.16b, $rkNm1.16b // AES block 3 - round N-1 |
| aese $ctr0.16b, $rkNm1.16b // AES block 0 - round N-1 |
| trn1 $t0.2d, $h1.2d, $h2.2d // h2h | h1h |
| eor $h12k.16b, $h12k.16b, $t0.16b // h2k | h1k |
| b.ge .Ldec_tail // handle tail |
| // Setup for AES blocks 0-3 is done purely on NEON side instead of mixing NEON and scalar instructions. |
| // This is because the final result of the AES block needs to be EORd with the final round key |
| // value ($v_rkN). This avoids several fmovs. |
| ldp $res2q, $res3q, [$input_ptr, #32] // AES blocks 2,3 load ciphertext |
| ldp $res0q, $res1q, [$input_ptr], #64 // AES blocks 0,1 load ciphertext |
| PLATFORM_EOR3($ctr0, $res0, $ctr0, $rk9_11_tmp) // AES block 0 - result |
| PLATFORM_EOR3($ctr1, $res1, $ctr1, $rk9_11_tmp) // AES block 1 - result |
| PLATFORM_EOR3($ctr2, $res2, $ctr2, $rk9_11_tmp) // AES block 2 - result |
| PLATFORM_EOR3($ctr3, $res3, $ctr3, $rk9_11_tmp) // AES block 3 - result |
| st1 { $ctr0.16b, $ctr1.16b, $ctr2.16b, $ctr3.16b}, [$output_ptr], #64 // AES blocks 0-3 - store result |
| ldr $ctr0q, [sp, #$ctr0_sp_offset] |
| ldr $ctr1q, [sp, #$ctr1_sp_offset] |
| ldr $ctr2q, [sp, #$ctr2_sp_offset] |
| ldr $ctr3q, [sp, #$ctr3_sp_offset] |
| rev $tmp_gpr_w, $ctr32w |
| str $tmp_gpr_w, [sp, #@{[ $ctr0_sp_offset + 12 ]}] |
| add $tmp_gpr_w, $ctr32w, #1 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| str $tmp_gpr_w, [sp, #@{[ $ctr1_sp_offset + 12 ]}] |
| add $tmp_gpr_w, $ctr32w, #2 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| str $tmp_gpr_w, [sp, #@{[ $ctr2_sp_offset + 12 ]}] |
| add $tmp_gpr_w, $ctr32w, #3 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| str $tmp_gpr_w, [sp, #@{[ $ctr3_sp_offset + 12 ]}] |
| add $ctr32w, $ctr32w, #4 |
| cmp $input_ptr, $main_end_input_ptr // check if we have <= 4 blocks |
| b.ge .Ldec_prepretail // do prepretail |
| .Ldec_main_loop: // main loop start |
| aese $ctr0.16b, $rk0.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 0 |
| aese $ctr1.16b, $rk0.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 0 |
| aese $ctr2.16b, $rk0.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 0 |
| aese $ctr3.16b, $rk0.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 0 |
| aese $ctr0.16b, $rk1.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 1 |
| aese $ctr1.16b, $rk1.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 1 |
| aese $ctr2.16b, $rk1.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 1 |
| aese $ctr3.16b, $rk1.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 1 |
| rev64 $res0.16b, $res0.16b // GHASH block 4k |
| rev64 $res1.16b, $res1.16b // GHASH block 4k+1 |
| rev64 $res2.16b, $res2.16b // GHASH block 4k+2 |
| rev64 $res3.16b, $res3.16b // GHASH block 4k+3 |
| aese $ctr0.16b, $rk2.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 2 |
| ext $acc_l.16b, $acc_l.16b, $acc_l.16b, #8 // PRE 0 |
| aese $ctr1.16b, $rk2.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 2 |
| eor $res0.16b, $res0.16b, $acc_l.16b // PRE 1 |
| aese $ctr2.16b, $rk2.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 2 |
| pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low |
| aese $ctr3.16b, $rk2.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 2 |
| pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 4k - high |
| aese $ctr0.16b, $rk3.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 3 |
| mov $acc_md, $h34k.d[1] // GHASH block 4k - mid |
| aese $ctr1.16b, $rk3.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 3 |
| mov $t0d, $res0.d[1] // GHASH block 4k - mid |
| aese $ctr2.16b, $rk3.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 3 |
| eor $t0.8b, $t0.8b, $res0.8b // GHASH block 4k - mid |
| aese $ctr3.16b, $rk3.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 3 |
| aese $ctr0.16b, $rk4.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 4 |
| pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high |
| aese $ctr1.16b, $rk4.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 4 |
| eor $acc_h.16b, $acc_h.16b, $t1.16b // GHASH block 4k+1 - high |
| aese $ctr2.16b, $rk4.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 4 |
| mov $t3d, $res1.d[1] // GHASH block 4k+1 - mid |
| aese $ctr3.16b, $rk4.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 4 |
| pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 4k - mid |
| aese $ctr0.16b, $rk5.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 5 |
| pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low |
| aese $ctr1.16b, $rk5.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 5 |
| eor $t3.8b, $t3.8b, $res1.8b // GHASH block 4k+1 - mid |
| aese $ctr3.16b, $rk5.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 5 |
| aese $ctr2.16b, $rk5.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 5 |
| pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low |
| aese $ctr0.16b, $rk6.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 6 |
| PLATFORM_EOR3($acc_l, $acc_l, $t2, $t5) // GHASH block 4k+1 - low & GHASH block 4k+2 - low |
| aese $ctr1.16b, $rk6.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 6 |
| mov $t6d, $res2.d[1] // GHASH block 4k+2 - mid |
| aese $ctr2.16b, $rk6.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 6 |
| eor $t6.8b, $t6.8b, $res2.8b // GHASH block 4k+2 - mid |
| pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 4k+1 - mid |
| ins $t6.d[1], $t6.d[0] // GHASH block 4k+2 - mid |
| aese $ctr3.16b, $rk6.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 6 |
| pmull2 $rk10_12.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high |
| mov $t9d, $res3.d[1] // GHASH block 4k+3 - mid |
| aese $ctr0.16b, $rk7.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 7 |
| pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 4k+2 - mid |
| aese $ctr1.16b, $rk7.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 7 |
| pmull $rk9_11_tmp.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low |
| aese $ctr2.16b, $rk7.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 7 |
| PLATFORM_EOR3($acc_m, $acc_m, $t3, $t6) // GHASH block 4k+1 - mid & GHASH block 4k+2 - mid |
| aese $ctr3.16b, $rk7.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 7 |
| pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high |
| eor $t9.8b, $t9.8b, $res3.8b // GHASH block 4k+3 - mid |
| PLATFORM_EOR3($acc_h, $acc_h, $rk10_12, $t7) // GHASH block 4k+2 - high & GHASH block 4k+3 - high |
| aese $ctr0.16b, $rk8.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 8 |
| aese $ctr1.16b, $rk8.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 8 |
| aese $ctr2.16b, $rk8.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 8 |
| pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 4k+3 - mid |
| ldr $mod_constantd, [sp, #$mod_constant_sp_offset] |
| aese $ctr3.16b, $rk8.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 8 |
| pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid |
| PLATFORM_EOR3($acc_m, $acc_m, $t9, $mod_t) // GHASH block 4k+3 - mid & MODULO - fold into mid |
| eor $acc_l.16b, $acc_l.16b, $rk9_11_tmp.16b // GHASH block 4k+3 - low |
| eor $t9.16b, $acc_l.16b, $acc_h.16b // MODULO - karatsuba tidy up |
| ext $acc_h.16b, $acc_h.16b, $acc_h.16b, #8 // MODULO - other top alignment |
| PLATFORM_EOR3($acc_m, $acc_m, $t9, $acc_h) // MODULO - karatsuba tidy up & MODULO - fold into mid |
| pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low |
| ext $acc_m.16b, $acc_m.16b, $acc_m.16b, #8 // MODULO - other mid alignment |
| PLATFORM_EOR3($acc_l, $acc_l, $mod_constant, $acc_m) // MODULO - fold into low |
| cmp $rounds_w, #12 // setup flags for AES-128/192/256 check |
| b.lt .Ldec_main_loop_continue // branch if AES-128 |
| ldp $rk9_11_tmpq, $rk10_12q, [$cc, #144] // load rk9, rk10 |
| aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 9 |
| aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 9 |
| aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 9 |
| aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 9 |
| aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 10 |
| aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 10 |
| aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 10 |
| aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 10 |
| b.eq .Ldec_main_loop_continue // branch if AES-192 |
| ldp $rk9_11_tmpq, $rk10_12q, [$cc, #176] // load rk11, rk12 |
| aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 11 |
| aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 11 |
| aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 11 |
| aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 11 |
| aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 12 |
| aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 12 |
| aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 12 |
| aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 12 |
| .Ldec_main_loop_continue: |
| ldr $rk9_11_tmpq, [$input_l1] // load rkN |
| ldp $res2q, $res3q, [$input_ptr, #32] // AES blocks 2,3 load ciphertext |
| ldp $res0q, $res1q, [$input_ptr], #64 // AES blocks 0,1 load ciphertext |
| aese $ctr0.16b, $rkNm1.16b // AES block 4k+4 - round N-1 |
| PLATFORM_EOR3($ctr0, $res0, $ctr0, $rk9_11_tmp) // AES block 4k+4 - result |
| aese $ctr1.16b, $rkNm1.16b // AES block 4k+5 - round N-1 |
| PLATFORM_EOR3($ctr1, $res1, $ctr1, $rk9_11_tmp) // AES block 4k+5 - result |
| aese $ctr2.16b, $rkNm1.16b // AES block 4k+6 - round N-1 |
| PLATFORM_EOR3($ctr2, $res2, $ctr2, $rk9_11_tmp) // AES block 4k+6 - result |
| aese $ctr3.16b, $rkNm1.16b // AES block 4k+7 - round N-1 |
| PLATFORM_EOR3($ctr3, $res3, $ctr3, $rk9_11_tmp) // AES block 4k+7 - result |
| st1 { $ctr0.16b, $ctr1.16b, $ctr2.16b, $ctr3.16b}, [$output_ptr], #64 // AES blocks 4k+4-7 - store result |
| ldr $ctr0q, [sp, #$ctr0_sp_offset] |
| ldr $ctr1q, [sp, #$ctr1_sp_offset] |
| ldr $ctr2q, [sp, #$ctr2_sp_offset] |
| ldr $ctr3q, [sp, #$ctr3_sp_offset] |
| rev $tmp_gpr_w, $ctr32w |
| str $tmp_gpr_w, [sp, #@{[ $ctr0_sp_offset + 12 ]}] |
| add $tmp_gpr_w, $ctr32w, #1 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| str $tmp_gpr_w, [sp, #@{[ $ctr1_sp_offset + 12 ]}] |
| add $tmp_gpr_w, $ctr32w, #2 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| str $tmp_gpr_w, [sp, #@{[ $ctr2_sp_offset + 12 ]}] |
| add $tmp_gpr_w, $ctr32w, #3 |
| rev $tmp_gpr_w, $tmp_gpr_w |
| str $tmp_gpr_w, [sp, #@{[ $ctr3_sp_offset + 12 ]}] |
| add $ctr32w, $ctr32w, #4 |
| cmp $input_ptr, $main_end_input_ptr |
| b.lt .Ldec_main_loop |
| .Ldec_prepretail: // PREPRETAIL |
| rev64 $res0.16b, $res0.16b // GHASH block 0 |
| rev64 $res1.16b, $res1.16b // GHASH block 1 |
| ext $acc_l.16b, $acc_l.16b, $acc_l.16b, #8 // PRE 0 |
| aese $ctr0.16b, $rk0.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 0 |
| aese $ctr1.16b, $rk0.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 0 |
| eor $res0.16b, $res0.16b, $acc_l.16b // PRE 1 |
| rev64 $res2.16b, $res2.16b // GHASH block 2 |
| aese $ctr1.16b, $rk1.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 1 |
| pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 0 - low |
| mov $t0d, $res0.d[1] // GHASH block 0 - mid |
| pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 0 - high |
| aese $ctr2.16b, $rk0.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 0 |
| mov $acc_md, $h34k.d[1] // GHASH block 0 - mid |
| aese $ctr0.16b, $rk1.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 1 |
| eor $t0.8b, $t0.8b, $res0.8b // GHASH block 0 - mid |
| pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 1 - high |
| aese $ctr2.16b, $rk1.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 1 |
| rev64 $res3.16b, $res3.16b // GHASH block 3 |
| aese $ctr3.16b, $rk0.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 0 |
| pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 0 - mid |
| eor $acc_h.16b, $acc_h.16b, $t1.16b // GHASH block 1 - high |
| pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 1 - low |
| aese $ctr3.16b, $rk1.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 1 |
| mov $t3d, $res1.d[1] // GHASH block 1 - mid |
| aese $ctr0.16b, $rk2.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 2 |
| aese $ctr1.16b, $rk2.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 2 |
| eor $acc_l.16b, $acc_l.16b, $t2.16b // GHASH block 1 - low |
| aese $ctr2.16b, $rk2.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 2 |
| aese $ctr0.16b, $rk3.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 3 |
| mov $t6d, $res2.d[1] // GHASH block 2 - mid |
| aese $ctr3.16b, $rk2.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 2 |
| eor $t3.8b, $t3.8b, $res1.8b // GHASH block 1 - mid |
| pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 2 - low |
| aese $ctr0.16b, $rk4.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 4 |
| aese $ctr3.16b, $rk3.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 3 |
| eor $t6.8b, $t6.8b, $res2.8b // GHASH block 2 - mid |
| pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 1 - mid |
| aese $ctr0.16b, $rk5.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 5 |
| eor $acc_l.16b, $acc_l.16b, $t5.16b // GHASH block 2 - low |
| aese $ctr3.16b, $rk4.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 4 |
| pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 3 - high |
| eor $acc_m.16b, $acc_m.16b, $t3.16b // GHASH block 1 - mid |
| pmull2 $t4.1q, $res2.2d, $h2.2d // GHASH block 2 - high |
| aese $ctr3.16b, $rk5.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 5 |
| ins $t6.d[1], $t6.d[0] // GHASH block 2 - mid |
| aese $ctr2.16b, $rk3.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 3 |
| aese $ctr1.16b, $rk3.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 3 |
| PLATFORM_EOR3($acc_h, $acc_h, $t4, $t7) // GHASH block 2 - high & GHASH block 3 - high |
| pmull $t8.1q, $res3.1d, $h1.1d // GHASH block 3 - low |
| aese $ctr2.16b, $rk4.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 4 |
| mov $t9d, $res3.d[1] // GHASH block 3 - mid |
| aese $ctr1.16b, $rk4.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 4 |
| pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 2 - mid |
| aese $ctr2.16b, $rk5.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 5 |
| eor $t9.8b, $t9.8b, $res3.8b // GHASH block 3 - mid |
| aese $ctr1.16b, $rk5.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 5 |
| aese $ctr3.16b, $rk6.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 6 |
| eor $acc_m.16b, $acc_m.16b, $t6.16b // GHASH block 2 - mid |
| aese $ctr2.16b, $rk6.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 6 |
| aese $ctr0.16b, $rk6.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 6 |
| movi $mod_constant.8b, #0xc2 |
| aese $ctr1.16b, $rk6.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 6 |
| eor $acc_l.16b, $acc_l.16b, $t8.16b // GHASH block 3 - low |
| pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 3 - mid |
| aese $ctr3.16b, $rk7.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 7 |
| aese $ctr1.16b, $rk7.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 7 |
| aese $ctr0.16b, $rk7.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 7 |
| eor $acc_m.16b, $acc_m.16b, $t9.16b // GHASH block 3 - mid |
| aese $ctr3.16b, $rk8.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 8 |
| aese $ctr2.16b, $rk7.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 7 |
| eor $t9.16b, $acc_l.16b, $acc_h.16b // MODULO - karatsuba tidy up |
| aese $ctr1.16b, $rk8.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 8 |
| aese $ctr0.16b, $rk8.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 8 |
| shl $mod_constantd, $mod_constantd, #56 // mod_constant |
| aese $ctr2.16b, $rk8.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 8 |
| cmp $rounds_w, #12 // setup flags for AES-128/192/256 check |
| b.lt .Ldec_finish_prepretail // branch if AES-128 |
| ldp $rk9_11_tmpq, $rk10_12q, [$cc, #144] // load rk9, rk10 |
| aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 9 |
| aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 9 |
| aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 9 |
| aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 9 |
| aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 10 |
| aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 10 |
| aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 10 |
| aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 10 |
| b.eq .Ldec_finish_prepretail // branch if AES-192 |
| ldp $rk9_11_tmpq, $rk10_12q, [$cc, #176] // load rk11, rk12 |
| aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 11 |
| aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 11 |
| aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 11 |
| aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 11 |
| aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 12 |
| aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 12 |
| aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 12 |
| aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 12 |
| .Ldec_finish_prepretail: |
| eor $acc_m.16b, $acc_m.16b, $t9.16b // MODULO - karatsuba tidy up |
| pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid |
| ext $acc_h.16b, $acc_h.16b, $acc_h.16b, #8 // MODULO - other top alignment |
| PLATFORM_EOR3($acc_m, $acc_m, $mod_t, $acc_h) // MODULO - fold into mid |
| pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low |
| ext $acc_m.16b, $acc_m.16b, $acc_m.16b, #8 // MODULO - other mid alignment |
| PLATFORM_EOR3($acc_l, $acc_l, $mod_constant, $acc_m) // MODULO - fold into low |
| aese $ctr1.16b, $rkNm1.16b // AES block 1 - round N-1 |
| aese $ctr0.16b, $rkNm1.16b // AES block 0 - round N-1 |
| aese $ctr3.16b, $rkNm1.16b // AES block 3 - round N-1 |
| aese $ctr2.16b, $rkNm1.16b // AES block 2 - round N-1 |
| .Ldec_tail: // TAIL |
| sub $main_end_input_ptr, $end_input_ptr, $input_ptr // main_end_input_ptr is number of bytes left to process |
| ld1 { $res1.16b}, [$input_ptr], #16 // AES block 0 - load ciphertext |
| eor $ctr0.16b, $res1.16b, $ctr0.16b // AES block 0 - result |
| mov $output_l0, $ctr0.d[0] // AES block 0 - mov low |
| mov $output_h0, $ctr0.d[1] // AES block 0 - mov high |
| ext $t0.16b, $acc_l.16b, $acc_l.16b, #8 // prepare final partial tag |
| eor $output_l0, $output_l0, $rkN_l // AES block 0 - round N low |
| eor $output_h0, $output_h0, $rkN_h // AES block 0 - round N high |
| cmp $main_end_input_ptr, #48 |
| b.gt .Ldec_blocks_more_than_3 |
| mov $ctr3.16b, $ctr2.16b |
| movi $acc_m.8b, #0 |
| movi $acc_l.8b, #0 |
| movi $acc_h.8b, #0 |
| mov $ctr2.16b, $ctr1.16b |
| cmp $main_end_input_ptr, #32 |
| b.gt .Ldec_blocks_more_than_2 |
| mov $ctr3.16b, $ctr1.16b |
| cmp $main_end_input_ptr, #16 |
| b.gt .Ldec_blocks_more_than_1 |
| b .Ldec_blocks_less_than_1 |
| .Ldec_blocks_more_than_3: // blocks left > 3 |
| rev64 $res0.16b, $res1.16b // GHASH final-3 block |
| ld1 { $res1.16b}, [$input_ptr], #16 // AES final-2 block - load ciphertext |
| stp $output_l0, $output_h0, [$output_ptr], #16 // AES final-3 block - store result |
| mov $acc_md, $h34k.d[1] // GHASH final-3 block - mid |
| eor $res0.16b, $res0.16b, $t0.16b // feed in partial tag |
| eor $ctr0.16b, $res1.16b, $ctr1.16b // AES final-2 block - result |
| mov $rk4d, $res0.d[1] // GHASH final-3 block - mid |
| mov $output_l0, $ctr0.d[0] // AES final-2 block - mov low |
| mov $output_h0, $ctr0.d[1] // AES final-2 block - mov high |
| eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-3 block - mid |
| movi $t0.8b, #0 // suppress further partial tag feed in |
| pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH final-3 block - high |
| pmull $acc_m.1q, $rk4v.1d, $acc_m.1d // GHASH final-3 block - mid |
| eor $output_l0, $output_l0, $rkN_l // AES final-2 block - round N low |
| pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH final-3 block - low |
| eor $output_h0, $output_h0, $rkN_h // AES final-2 block - round N high |
| .Ldec_blocks_more_than_2: // blocks left > 2 |
| rev64 $res0.16b, $res1.16b // GHASH final-2 block |
| ld1 { $res1.16b}, [$input_ptr], #16 // AES final-1 block - load ciphertext |
| eor $res0.16b, $res0.16b, $t0.16b // feed in partial tag |
| stp $output_l0, $output_h0, [$output_ptr], #16 // AES final-2 block - store result |
| eor $ctr0.16b, $res1.16b, $ctr2.16b // AES final-1 block - result |
| mov $rk4d, $res0.d[1] // GHASH final-2 block - mid |
| pmull $rk3q1, $res0.1d, $h3.1d // GHASH final-2 block - low |
| pmull2 $rk2.1q, $res0.2d, $h3.2d // GHASH final-2 block - high |
| eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-2 block - mid |
| mov $output_l0, $ctr0.d[0] // AES final-1 block - mov low |
| mov $output_h0, $ctr0.d[1] // AES final-1 block - mov high |
| eor $acc_l.16b, $acc_l.16b, $rk3.16b // GHASH final-2 block - low |
| movi $t0.8b, #0 // suppress further partial tag feed in |
| pmull $rk4v.1q, $rk4v.1d, $h34k.1d // GHASH final-2 block - mid |
| eor $acc_h.16b, $acc_h.16b, $rk2.16b // GHASH final-2 block - high |
| eor $output_l0, $output_l0, $rkN_l // AES final-1 block - round N low |
| eor $acc_m.16b, $acc_m.16b, $rk4v.16b // GHASH final-2 block - mid |
| eor $output_h0, $output_h0, $rkN_h // AES final-1 block - round N high |
| .Ldec_blocks_more_than_1: // blocks left > 1 |
| stp $output_l0, $output_h0, [$output_ptr], #16 // AES final-1 block - store result |
| rev64 $res0.16b, $res1.16b // GHASH final-1 block |
| ld1 { $res1.16b}, [$input_ptr], #16 // AES final block - load ciphertext |
| eor $res0.16b, $res0.16b, $t0.16b // feed in partial tag |
| movi $t0.8b, #0 // suppress further partial tag feed in |
| mov $rk4d, $res0.d[1] // GHASH final-1 block - mid |
| eor $ctr0.16b, $res1.16b, $ctr3.16b // AES final block - result |
| pmull2 $rk2q1, $res0.2d, $h2.2d // GHASH final-1 block - high |
| eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-1 block - mid |
| pmull $rk3q1, $res0.1d, $h2.1d // GHASH final-1 block - low |
| mov $output_l0, $ctr0.d[0] // AES final block - mov low |
| ins $rk4v.d[1], $rk4v.d[0] // GHASH final-1 block - mid |
| mov $output_h0, $ctr0.d[1] // AES final block - mov high |
| pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d // GHASH final-1 block - mid |
| eor $output_l0, $output_l0, $rkN_l // AES final block - round N low |
| eor $acc_l.16b, $acc_l.16b, $rk3.16b // GHASH final-1 block - low |
| eor $acc_h.16b, $acc_h.16b, $rk2.16b // GHASH final-1 block - high |
| eor $acc_m.16b, $acc_m.16b, $rk4v.16b // GHASH final-1 block - mid |
| eor $output_h0, $output_h0, $rkN_h // AES final block - round N high |
| .Ldec_blocks_less_than_1: // blocks left <= 1 |
| add $T32, $T32, $bit_length, lsr #7 // Calculate the updated counter based on the number of 16B chunks we processed |
| rev $W32, $W32 |
| str $W32, [$counter, #12] // store the updated counter |
| and $bit_length, $bit_length, #127 // bit_length %= 128 |
| mvn $rkN_h, xzr // rkN_h = 0xffffffffffffffff |
| sub $bit_length, $bit_length, #128 // bit_length -= 128 |
| mvn $rkN_l, xzr // rkN_l = 0xffffffffffffffff |
| ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] // load existing bytes we need to not overwrite |
| neg $bit_length, $bit_length // bit_length = 128 - #bits in input (in range [1,128]) |
| and $bit_length, $bit_length, #127 // bit_length %= 128 |
| lsr $rkN_h, $rkN_h, $bit_length // rkN_h is mask for top 64b of last block |
| cmp $bit_length, #64 |
| csel $ctr32x, $rkN_l, $rkN_h, lt |
| csel $ctr96_b64x, $rkN_h, xzr, lt |
| fmov $ctr0d, $ctr32x // ctr0b is mask for last block |
| and $output_l0, $output_l0, $ctr32x |
| mov $ctr0.d[1], $ctr96_b64x |
| bic $end_input_ptr, $end_input_ptr, $ctr32x // mask out low existing bytes |
| bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x // mask out high existing bytes |
| orr $output_l0, $output_l0, $end_input_ptr |
| and $output_h0, $output_h0, $ctr96_b64x |
| orr $output_h0, $output_h0, $main_end_input_ptr |
| and $res1.16b, $res1.16b, $ctr0.16b // possibly partial last block has zeroes in highest bits |
| rev64 $res0.16b, $res1.16b // GHASH final block |
| eor $res0.16b, $res0.16b, $t0.16b // feed in partial tag |
| pmull $rk3q1, $res0.1d, $h1.1d // GHASH final block - low |
| mov $t0d, $res0.d[1] // GHASH final block - mid |
| eor $t0.8b, $t0.8b, $res0.8b // GHASH final block - mid |
| pmull2 $rk2q1, $res0.2d, $h1.2d // GHASH final block - high |
| pmull $t0.1q, $t0.1d, $h12k.1d // GHASH final block - mid |
| eor $acc_h.16b, $acc_h.16b, $rk2.16b // GHASH final block - high |
| eor $acc_l.16b, $acc_l.16b, $rk3.16b // GHASH final block - low |
| eor $acc_m.16b, $acc_m.16b, $t0.16b // GHASH final block - mid |
| ldr $mod_constantd, [sp, #$mod_constant_sp_offset] |
| eor $t9.16b, $acc_l.16b, $acc_h.16b // MODULO - karatsuba tidy up |
| eor $acc_m.16b, $acc_m.16b, $t9.16b // MODULO - karatsuba tidy up |
| pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid |
| ext $acc_h.16b, $acc_h.16b, $acc_h.16b, #8 // MODULO - other top alignment |
| eor $acc_m.16b, $acc_m.16b, $mod_t.16b // MODULO - fold into mid |
| eor $acc_m.16b, $acc_m.16b, $acc_h.16b // MODULO - fold into mid |
| pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low |
| ext $acc_m.16b, $acc_m.16b, $acc_m.16b, #8 // MODULO - other mid alignment |
| eor $acc_l.16b, $acc_l.16b, $mod_constant.16b // MODULO - fold into low |
| stp $output_l0, $output_h0, [$output_ptr] |
| eor $acc_l.16b, $acc_l.16b, $acc_m.16b // MODULO - fold into low |
| ext $acc_l.16b, $acc_l.16b, $acc_l.16b, #8 |
| rev64 $acc_l.16b, $acc_l.16b // Final Tag |
| mov x0, $len |
| st1 { $acc_l.16b }, [$current_tag] // Store final tag |
| ldp x19, x20, [sp, #16] |
| ldp x21, x22, [sp, #32] |
| ldp x23, x24, [sp, #48] |
| ldp d8, d9, [sp, #64] |
| ldp d10, d11, [sp, #80] |
| ldp d12, d13, [sp, #96] |
| ldp d14, d15, [sp, #112] |
| ldp x29, x30, [sp], #224 |
| AARCH64_VALIDATE_LINK_REGISTER |
| ret |
| .size aes_gcm_dec_kernel,.-aes_gcm_dec_kernel |
| ___ |
| # 1. Print directives |
| print $header_directives; |
| |
| # 2. Print Legacy implementation |
| print get_transformed_code(0, $code_template); |
| |
| print "\n"; |
| |
| # 3. Print EOR3 implementation |
| print get_transformed_code(1, $code_template); |
| |
| print <<'___'; |
| #endif // __ARM_MAX_ARCH__ >= 8 |
| ___ |
| |
| # Close the handle (which flushes to arm-xlate.pl) |
| close STDOUT or die "error closing STDOUT: $!"; |
| } |
| } |