#! /usr/bin/env perl
# Copyright (c) 2022, ARM Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#========================================================================
# Written by Fangming Fang <fangming.fang@arm.com> for the OpenSSL project,
# derived from https://github.com/ARM-software/AArch64cryptolib, original
# author Samuel Lee <Samuel.Lee@arm.com>.
#========================================================================
#
# Approach - assume we don't want to reload constants, so reserve ~half of
# vector register file for constants
#
# The main loop acts on 4 16B blocks per iteration and then does a modulo
# reduction of the intermediate hashes accumulated from those 4 blocks
#
# ____________________________________________________
# | |
# | PRE |
# |____________________________________________________|
# | | | |
# | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 |
# |________________|________________|__________________|
# | | | |
# | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 |
# |________________|________________|__________________|
# | | | |
# | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 |
# |________________|________________|__________________|
# | | | |
# | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 |
# |________________|____(mostly)____|__________________|
# | |
# | MODULO |
# |____________________________________________________|
#
# PRE: Ensure the previously generated intermediate hash is aligned and merged with
# the result for GHASH 4k+0
#
# EXT low_acc, low_acc, low_acc, #8
# EOR res_curr (4k+0), res_curr (4k+0), low_acc
#
# CTR block: Increment and byte reverse counter in scalar registers and transfer
# to SIMD registers
#
# REV ctr32, rev_ctr32
# ORR ctr64, constctr96_top32, ctr32, LSL #32
# // Keeping this in scalar registers to free up space in SIMD RF
# INS ctr_next.d[0], constctr96_bottom64
# INS ctr_next.d[1], ctr64
# ADD rev_ctr32, #1
#
# AES block:
#
# Do AES encryption/decryption on CTR block X and EOR it with input block X.
# Take a 256-bit key below as an example. We do a small trick here of loading
# the input in scalar registers, EORing with the last round key and then
# transferring into SIMD registers. Given we are very constrained in our ASIMD
# registers this is quite important
#
# Encrypt:
# LDR input_low, [ input_ptr ], #8
# LDR input_high, [ input_ptr ], #8
# EOR input_low, k14_low
# EOR input_high, k14_high
# INS res_curr.d[0], input_low
# INS res_curr.d[1], input_high
# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k13
# EOR res_curr, res_curr, ctr_curr
# ST1 { res_curr.16b }, [ output_ptr ], #16
#
# Decrypt:
# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
# AESE ctr_curr, k13
# LDR res_curr, [ input_ptr ], #16
# EOR res_curr, res_curr, ctr_curr
# MOV output_low, res_curr.d[0]
# MOV output_high, res_curr.d[1]
# EOR output_low, k14_low
# EOR output_high, k14_high
# STP output_low, output_high, [ output_ptr ], #16
#
# GHASH block X:
# Do 128b Karatsuba polynomial multiplication on the block. We only have
# 64b->128b polynomial multipliers, so naively we would need 4 64b
# multiplies to compute the full 128b x 128b product.
#
# multiplication:
# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^
# (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
#
# The idea behind Karatsuba multiplication is that we can do just 3 64b
# multiplies:
# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^
# (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^
# Pmull(Al,Bl))<<64
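#
# To see why three multiplies suffice: PMULL is a carry-less multiply, so it
# distributes over XOR, giving
#
# Pmull(Ah^Al, Bh^Bl) == Pmull(Ah,Bh) ^ Pmull(Ah,Bl) ^
# Pmull(Al,Bh) ^ Pmull(Al,Bl)
#
# Rearranging recovers the middle term of the schoolbook form from products
# we already have:
#
# Pmull(Ah,Bl) ^ Pmull(Al,Bh) == Pmull(Ah^Al, Bh^Bl) ^ Pmull(Ah,Bh) ^
# Pmull(Al,Bl)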
#
# There is some complication here because the bit order of GHASH's PMULL is
# reversed compared to elsewhere, so we are multiplying with "twisted"
# powers of H
#
# Note: We can PMULL directly into the acc_x in first GHASH of the loop
#
# Note: For scheduling big cores we want to split the processing to happen over
# two loop iterations - otherwise the critical path latency dominates the
# performance.
#
# This has a knock-on effect on register pressure, so we have to be a bit
# more clever with our temporary registers than indicated here
#
# REV64 res_curr, res_curr
# INS t_m.d[0], res_curr.d[1]
# EOR t_m.8B, t_m.8B, res_curr.8B
# PMULL2 t_h, res_curr, HX
# PMULL t_l, res_curr, HX
# PMULL t_m, t_m, HX_k
# EOR acc_h, acc_h, t_h
# EOR acc_l, acc_l, t_l
# EOR acc_m, acc_m, t_m
#
# MODULO: take the partial accumulators from GHASH (together ~representing the
# sum of the 256b multiplication results) and do modulo reduction on them.
# There is some complication here because the bit order of GHASH's
# PMULL is reversed compared to elsewhere, so we are doing modulo with
# a reversed constant
#
# EOR acc_m, acc_m, acc_h
# EOR acc_m, acc_m, acc_l // Finish off karatsuba processing
# PMULL t_mod, acc_h, mod_constant
# EXT acc_h, acc_h, acc_h, #8
# EOR acc_m, acc_m, acc_h
# EOR acc_m, acc_m, t_mod
# PMULL acc_h, acc_m, mod_constant
# EXT acc_m, acc_m, acc_m, #8
# EOR acc_l, acc_l, acc_h
# EOR acc_l, acc_l, acc_m
#
# Key Optimizations:
#
# NOTE: This implementation is heavily NEON-bound due to the large amount of
# vector computation. The original code from which this was derived often
# avoided unnecessary Load/Store operations, but at the expense of
# increased NEON pipeline pressure and more GPR <-> NEON data movements.
# The primary goal of the optimizations applied here is to reduce NEON
# pipeline pressure. This includes minimizing NEON register-to-register moves,
# GPR to NEON transfers (and vice versa), and other operations that consume
# execution slots on the NEON pipes.
#
# 1. Merged Kernels: AES-128, AES-192, and AES-256 GCM encryption and
# decryption are each handled by a single kernel, reducing code size.
# Conditional branching is used for the key-size specific rounds.
#
# 2. Aggressive Pipelining: GHASH and AES operations for different blocks
# are heavily interleaved within the main loop: GHASH for blocks 4k to
# 4k+3 (the ciphertext produced in the previous iteration) proceeds while
# AES for blocks 4k+4 to 4k+7 is computed. This hides instruction latencies.
#
# 3. Optimized Counter Handling: Instead of incrementing, reversing, and
# moving the counter to NEON registers for each of the four blocks in
# every loop iteration, we precompute and cache counter values on the
# stack.
# - The lower 96 bits of the counter are constant across the four blocks
# within an iteration.
# - The upper 32 bits (the byte-swapped incrementing part) are calculated
# for all four blocks (N, N+1, N+2, N+3) and stored on the stack at
# the start of the loop.
# - Values are calculated one loop iteration ahead of time. That is,
# immediately after loading values N..N+3 we calculate and store values
# N+4..N+7. This gives the stores time to drain, so the subsequent loads
# do not have to rely on store-to-load forwarding (see the sketch after
# this list).
# - This strategy eliminates repeated scalar-to-vector transfers (FMOV)
# and scalar increment/reverse operations inside the tight loop.
#
# 4. Reduced Scalar-NEON Transfers:
# - For encryption, the final XOR with the last round key (rkN) and the
# AES-encrypted counter is done entirely in NEON registers using
# EOR or PLATFORM_EOR3, avoiding costly FMOV instructions to move
# plaintext data from GPRs to NEON after partial operations. A similar
# optimization is used for decryption.
#
# 5. EOR3 Optimization: The PLATFORM_EOR3 macro allows using the single
# EOR3 instruction (if available) to replace two EOR instructions,
# i.e., D = A ^ B ^ C. This is used in both GHASH accumulator updates
# and the final AES output XOR. Both the EOR3 and plain-EOR variants of the
# encrypt and decrypt kernels are emitted into the same assembly file.
#
# 6. Instruction scheduling was significantly modified, both to directly improve
# performance and to remove false dependencies where that enabled further
# improvements. There is still a bit of additional speedup possible, but it
# would be easiest to extract with platform-specific implementations.
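#
# As a rough sketch of the per-iteration counter handling described in (3)
# above (names are illustrative; the first 12 bytes of each stack slot, the
# constant part of the counter, are written once at setup and never change):
#
# // load the four counter blocks prepared by the previous iteration
# LDP ctr0q, ctr1q, [sp, #ctr0_slot]
# LDP ctr2q, ctr3q, [sp, #ctr2_slot]
# // immediately prepare the counters for the *next* iteration
# REV tmp_w, rev_ctr32
# STR tmp_w, [sp, #ctr0_slot + 12]
# ADD tmp_w, rev_ctr32, #1
# REV tmp_w, tmp_w
# STR tmp_w, [sp, #ctr1_slot + 12]
# ... // likewise +2 and +3 into the ctr2/ctr3 slots
# ADD rev_ctr32, rev_ctr32, #4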
#
use strict;
use warnings;
my $flavour = shift;
my $output = shift;
if (!defined $flavour) {
die "Usage: $0 <flavour> [output_filename]\n";
}
$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1;
my $xlate;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT, "|-", $^X, $xlate, $flavour, $output;
*STDOUT=*OUT;
# Converts a PLATFORM_EOR3 pseudo-instruction into either a single EOR3
# instruction or an equivalent pair of EOR instructions.
sub process_eor3_match {
my ($vd_str, $vn_str, $vm_str, $va_str, $is_eor3) = @_;
if ($is_eor3) {
# Directly emit the EOR3 mnemonic.
# Requires: .arch armv8.2-a+sha3 OR -march=armv8.2-a+sha3
return sprintf(" eor3 %s.16b, %s.16b, %s.16b, %s.16b",
$vd_str, $vn_str, $vm_str, $va_str);
} else {
# Fallback: Two standard NEON EOR instructions
# Result = (Vn ^ Vm) ^ Va
# 1. Vd = Vn ^ Vm
# 2. Vd = Vd ^ Va
return sprintf(" eor %s.16b, %s.16b, %s.16b\n" .
" eor %s.16b, %s.16b, %s.16b",
$vd_str, $vn_str, $vm_str, # Step 1
$vd_str, $vd_str, $va_str); # Step 2
}
}
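# For example, a template line such as
#   PLATFORM_EOR3(v4, v4, v30, v0)
# expands in the EOR3 variant to
#   eor3 v4.16b, v4.16b, v30.16b, v0.16b
# and in the standard variant to
#   eor v4.16b, v4.16b, v30.16b
#   eor v4.16b, v4.16b, v0.16b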
# Generates the code string for a specific version (Standard or EOR3).
# Performs regex substitutions for EOR3 instructions and symbol renaming.
sub get_transformed_code {
my ($is_eor3, $template) = @_;
my $temp_code = $template;
if ($is_eor3) {
# Suffix function names
$temp_code =~ s/\b(aes_gcm_enc_kernel)\b/${1}_eor3/g;
$temp_code =~ s/\b(aes_gcm_dec_kernel)\b/${1}_eor3/g;
# Suffix local labels
$temp_code =~ s/(\.L[a-zA-Z0-9_]+)/$1_eor3/g;
}
$temp_code =~ s/PLATFORM_EOR3\s*\(\s*([vV0-9]+)\s*,\s*([vV0-9]+)\s*,\s*([vV0-9]+)\s*,\s*([vV0-9]+)\s*\)/
process_eor3_match($1, $2, $3, $4, $is_eor3)/ge;
return $temp_code;
}
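# For instance, when $is_eor3 is set, "aes_gcm_enc_kernel" becomes
# "aes_gcm_enc_kernel_eor3" and a label such as ".Lenc_main_loop" becomes
# ".Lenc_main_loop_eor3", so the standard and EOR3 variants can coexist in
# the same assembly file.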
{
{
# We attempt to minimize the number of aliases for the same register but
# unfortunately sometimes you have to specify a Q or D register.
## AArch64 General-Purpose Registers (GPRs)
my $input_ptr = "x0"; # IN: Pointer to input plaintext
my $bit_length = "x1"; # IN: Length of input in bits
my $output_ptr = "x2"; # IN: Pointer to output ciphertext
my $current_tag = "x3"; # IN: Pointer to 16-byte GCM tag (T)
my $end_input_ptr = "x4"; # Calculated end of all input
my $main_end_input_ptr = "x5"; # Calculated end of the main 4-block loop
my $Htable = "x6"; # Pointer to GHASH key table (H)
my $key_ptr = "x7"; # Pointer to end of AES key schedule
my $cc = "x8"; # Pointer to AES key schedule (context)
# Re-purposed registers for tail/scalar handling
my $input_l0 = "x6";
my $input_h0 = "x7";
my $T32 = "x10"; # Holds scalar 64-bit counter value
my $W32 = "w10"; # Holds scalar 32-bit counter value
my $W_T32 = "w12"; # Holds reversed scalar 32-bit counter
my $rkN_l = "x13"; # Holds low 64 bits of final round key
my $rkN_h = "x14"; # Holds high 64 bits of final round key
my $len = "x15"; # Byte length of input
my $counter = "x16"; # Holds pointer to the 16-byte counter block (CTR)
my $rounds = "x17"; # Number of AES rounds (scalar)
my $rounds_w = "w17"; # Number of AES rounds (scalar)
my $tmp_gpr_w = "w20"; # Scratch register (word)
my $mod_constantx = "x21"; # Holds GHASH modulus 0xc2... (scalar)
## NEON/SIMD Registers (v0-v31)
my ($ctr0, $ctr1, $ctr2, $ctr3) = map("v$_", (0..3));
my ($ctr0q, $ctr1q, $ctr2q, $ctr3q) = map("q$_", (0..3));
my $ctr0d = "d0";
my ($res0, $res1, $res2, $res3) = map("v$_", (4..7));
my ($res0q, $res1q, $res2q, $res3q) = map("q$_", (4..7));
my ($res0d, $res1d, $res2d, $res3d) = map("d$_", (4..7));
my ($acc_h, $acc_m, $acc_l) = map("v$_", (9..11));
my ($h1, $h2, $h3, $h4) = map("v$_", (12..15));
my ($h1q, $h2q, $h3q, $h4q) = map("q$_", (12..15));
my ($h12k, $h34k) = map("v$_", (16..17)); # H[1]^H[2] and H[3]^H[4]
my ($rk0, $rk1, $rk2, $rk3, $rk4, $rk5, $rk6, $rk7, $rk8) = map("v$_", (18..26));
my ($rk0q, $rk1q, $rk2q, $rk3q, $rk4q, $rk5q, $rk6q, $rk7q, $rk8q) = map("q$_", (18..26));
my $rk9_11_tmp = "v27"; # Used for rk9 (if AES-192) and rk11 (if AES-256)
my $rk10_12 = "v28"; # Used for rk10 (if AES-192) and rk12 (if AES-256)
my $rk9_11_tmpq = "q27";
my $rk10_12q = "q28";
my $rkNm1 = "v31"; # Round N-1 key
my $rkNm1q = "q31";
my $mod_constant = "v8";
my $mod_constantd = "d8";
my $final_block_dest = "v18";
my $final_block_destq = "q18";
my ($ghash_t0, $ghash_t1, $ghash_t2, $ghash_t3) = map("v$_", (20..23));
my ($ghash_t0d, $ghash_t1d, $ghash_t2d, $ghash_t3d) = map("d$_", (20..23));
my $v_rkN = "v30";
my $v_rkNq = "q30";
# Locations on the stack
my $mod_constant_sp_offset = 128;
my $ctr0_sp_offset = 160;
my $ctr1_sp_offset = 176;
my $ctr2_sp_offset = 192;
my $ctr3_sp_offset = 208;
# Registers specific to dec_kernel
my ($input_l1, $input_h1) = map("x$_", (19..20));
my $ctr32w = "w9";
my ($ctr32x, $ctr96_b64x) = map("x$_", (9..10));
my $acc_md = "d10";
# Decryption temporary NEON registers
my $t0 = "v8"; my $t0d = "d8";
my $t1 = "v4"; my $t1d = "d4";
my $t2 = "v8"; my $t2d = "d8";
my $t3 = "v4"; my $t3d = "d4";
my $t4 = "v4"; my $t4d = "d4";
my $t5 = "v5"; my $t5d = "d5";
my $t6 = "v8"; my $t6d = "d8";
my $t7 = "v5"; my $t7d = "d5";
my $t8 = "v4"; my $t8d = "d8";
my $t9 = "v6"; my $t9d = "d6";
my ($ctr_t0, $ctr_t1, $ctr_t2, $ctr_t3) = map("v$_", (4..7));
my $mod_t = "v7";
# Decryption key registers
my $rk2q1 = "v20.1q";
my $rk3q1 = "v21.1q";
my $rk4v = "v22";
my $rk4d = "d22";
my ($output_l1, $output_h1, $output_l2, $output_h2, $output_l3, $output_h3) = map("x$_", (19..24));
my ($output_l0, $output_h0) = map("x$_", (6..7));
# --- Assemble the entire template ---
my $code_template = "";
my $header_directives = <<'___';
#if __ARM_MAX_ARCH__ >= 8
.arch armv8.2-a+crypto+sha3
.text
___
$code_template .= <<"___";
.global aes_gcm_enc_kernel
.type aes_gcm_enc_kernel,%function
.align 4
aes_gcm_enc_kernel:
AARCH64_SIGN_LINK_REGISTER
stp x29, x30, [sp, #-224]!
mov x29, sp
ld1 { $ctr0.16b}, [x4] // Load initial counter block
stp x19, x20, [sp, #16]
mov $ctr1.16b, $ctr0.16b // Initialize ctr1-3 from ctr0
mov $ctr2.16b, $ctr0.16b
mov $ctr3.16b, $ctr0.16b
mov $counter, x4 // Pointer to counter block in memory
mov $cc, x5 // Pointer to AES key schedule context
stp $mod_constantx, x22, [sp, #32]
// [sp, #48] is unused but allocated to align the stack layout with aes_gcm_dec_kernel
stp d8, d9, [sp, #64] // Save Neon registers
stp d10, d11, [sp, #80]
stp d12, d13, [sp, #96]
stp d14, d15, [sp, #112]
ldr $rounds_w, [$cc, #240] // Load number of AES rounds
add $key_ptr, $cc, $rounds, lsl #4 // Calculate pointer to the last round key
ldp $rkN_l, $rkN_h, [$key_ptr] // load round N key (for final XOR)
ldr $rkNm1q, [$key_ptr, #-16] // load round N-1 key
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 // Calculate end of input
lsr $main_end_input_ptr, $bit_length, #3 // Total byte length
mov $len, $main_end_input_ptr
ldr $W_T32, [$counter, #12] // Load counter's low 32 bits
sub $main_end_input_ptr, $main_end_input_ptr, #1 // byte_len - 1
ldr $rk0q, [$cc, #0] // load rk0
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 // Align main loop end to a multiple of 64 bytes
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
rev $W_T32, $W_T32 // Reverse for big-endian increment
uxtw $T32, $W_T32 // Zero extend reversed w12 into x10 for final counter update
// Pre-compute this value instead of using two instructions to reconstruct it every iteration
mov $mod_constantx, #0xc200000000000000 // GHASH reduction constant
str $mod_constantx, [sp, #$mod_constant_sp_offset]
// We maintain four copies of ctr values on the stack. Each loop iteration we
// store the updated ctr value to the last four bytes (e.g., $ctr0_sp_offset + 12).
// We then load the four values. This avoids a significant number of
// expensive GPR->NEON and NEON->NEON moves. To avoid store-to-load forwarding we
// calculate and store the values one iteration ahead so they have time to
// drain before we load them.
str $ctr0q, [sp, #$ctr0_sp_offset] // Store base counter for block 0-3
str $ctr0q, [sp, #$ctr1_sp_offset]
str $ctr0q, [sp, #$ctr2_sp_offset]
str $ctr0q, [sp, #$ctr3_sp_offset]
// Since we need the values right away, we don't go through the stack this first
// time. Manually insert the incremented big-endian counter values.
rev $tmp_gpr_w, $W_T32
mov $ctr0.s[3], $tmp_gpr_w // ctr0 + 0
add $tmp_gpr_w, $W_T32, #1
rev $tmp_gpr_w, $tmp_gpr_w
mov $ctr1.s[3], $tmp_gpr_w // ctr0 + 1
add $tmp_gpr_w, $W_T32, #2
rev $tmp_gpr_w, $tmp_gpr_w
mov $ctr2.s[3], $tmp_gpr_w // ctr0 + 2
add $tmp_gpr_w, $W_T32, #3
rev $tmp_gpr_w, $tmp_gpr_w
mov $ctr3.s[3], $tmp_gpr_w // ctr0 + 3
// Calculate the ctr values for the *next* (not current) group of four
// blocks. Store the incremented parts to the stack.
add $tmp_gpr_w, $W_T32, #4
rev $tmp_gpr_w, $tmp_gpr_w
str $tmp_gpr_w, [sp, #@{[ $ctr0_sp_offset + 12 ]}] // ctr0 + 4 for next iter
add $tmp_gpr_w, $W_T32, #5
rev $tmp_gpr_w, $tmp_gpr_w
str $tmp_gpr_w, [sp, #@{[ $ctr1_sp_offset + 12 ]}] // ctr0 + 5 for next iter
add $tmp_gpr_w, $W_T32, #6
rev $tmp_gpr_w, $tmp_gpr_w
str $tmp_gpr_w, [sp, #@{[ $ctr2_sp_offset + 12 ]}] // ctr0 + 6 for next iter
add $tmp_gpr_w, $W_T32, #7
rev $tmp_gpr_w, $tmp_gpr_w
str $tmp_gpr_w, [sp, #@{[ $ctr3_sp_offset + 12 ]}] // ctr0 + 7 for next iter
add $W_T32, $W_T32, #8 // Advance counter past these two sets
// --- Start AES for first 4 blocks ---
aese $ctr0.16b, $rk0.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 0
ldp $rk1q, $rk2q, [$cc, #16] // load rk1, rk2
aese $ctr1.16b, $rk0.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 0
ldp $rk3q, $rk4q, [$cc, #48] // load rk3, rk4
aese $ctr2.16b, $rk0.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 0
ldp $rk5q, $rk6q, [$cc, #80] // load rk5, rk6
aese $ctr3.16b, $rk0.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 0
ldp $h2q, $h3q, [$Htable, #32] // load H2, H3 (GHASH keys)
aese $ctr0.16b, $rk1.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 1
ldp $rk7q, $rk8q, [$cc, #112] // load rk7, rk8
aese $ctr1.16b, $rk1.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 1
aese $ctr2.16b, $rk1.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 1
aese $ctr3.16b, $rk1.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 1
aese $ctr0.16b, $rk2.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 2
ext $h3.16b, $h3.16b, $h3.16b, #8 // Byte swap H3 for GHASH
aese $ctr1.16b, $rk2.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 2
ext $h2.16b, $h2.16b, $h2.16b, #8 // Byte swap H2 for GHASH
aese $ctr2.16b, $rk2.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 2
ldr $h4q, [$Htable, #80] // load H4
aese $ctr3.16b, $rk2.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 2
ext $h4.16b, $h4.16b, $h4.16b, #8 // Byte swap H4 for GHASH
aese $ctr0.16b, $rk3.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 3
ld1 { $acc_l.16b}, [$current_tag] // Load initial GHASH accumulator (T)
aese $ctr1.16b, $rk3.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 3
ext $acc_l.16b, $acc_l.16b, $acc_l.16b, #8 // Byte swap T for GHASH
aese $ctr2.16b, $rk3.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 3
rev64 $acc_l.16b, $acc_l.16b // Correct byte order within 64-bit lanes
aese $ctr3.16b, $rk3.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 3
trn2 $h34k.2d, $h3.2d, $h4.2d // Karatsuba key: H4_low | H3_low
aese $ctr0.16b, $rk4.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 4
ldr $h1q, [$Htable] // load H1
aese $ctr1.16b, $rk4.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 4
ext $h1.16b, $h1.16b, $h1.16b, #8 // Byte swap H1 for GHASH
aese $ctr2.16b, $rk4.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 4
trn1 $acc_h.2d, $h3.2d, $h4.2d // Karatsuba key: H4_high | H3_high
aese $ctr3.16b, $rk4.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 4
trn2 $h12k.2d, $h1.2d, $h2.2d // Karatsuba key: H2_low | H1_low
aese $ctr0.16b, $rk5.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 5
ldr $v_rkNq, [$key_ptr] // Preload round N key for final EOR
aese $ctr1.16b, $rk5.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 5
aese $ctr3.16b, $rk5.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 5
aese $ctr2.16b, $rk5.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 5
aese $ctr0.16b, $rk6.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 6
aese $ctr1.16b, $rk6.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 6
aese $ctr2.16b, $rk6.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 6
aese $ctr3.16b, $rk6.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 6
aese $ctr0.16b, $rk7.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 7
aese $ctr1.16b, $rk7.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 7
aese $ctr2.16b, $rk7.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 7
aese $ctr3.16b, $rk7.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 7
aese $ctr0.16b, $rk8.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 8
aese $ctr1.16b, $rk8.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 8
aese $ctr2.16b, $rk8.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 8
aese $ctr3.16b, $rk8.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 8
cmp $rounds, #12 // setup flags for AES-128/192/256 check
b.lt .Lenc_finish_first_blocks // branch if AES-128
ldp $rk9_11_tmpq, $rk10_12q, [$cc, #144] // load rk9, rk10
aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 9
aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 9
aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 9
aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 9
aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 10
aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 10
aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 10
aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 10
b.eq .Lenc_finish_first_blocks // branch if AES-192
ldp $rk9_11_tmpq, $rk10_12q, [$cc, #176] // load rk11, rk12
aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 11
aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 11
aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 11
aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 11
aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 12
aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 12
aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 12
aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 12
.Lenc_finish_first_blocks:
cmp $input_ptr, $main_end_input_ptr // check if we have <= 4 blocks to process in the tail
eor $h34k.16b, $h34k.16b, $acc_h.16b // Karatsuba key: H3^H4
aese $ctr0.16b, $rkNm1.16b // AES block 0 - round N-1
aese $ctr1.16b, $rkNm1.16b // AES block 1 - round N-1
aese $ctr2.16b, $rkNm1.16b // AES block 2 - round N-1
aese $ctr3.16b, $rkNm1.16b // AES block 3 - round N-1
trn1 $mod_constant.2d, $h1.2d, $h2.2d // Karatsuba key: H2_high | H1_high
eor $h12k.16b, $h12k.16b, $mod_constant.16b // Karatsuba key: H1^H2
b.ge .Lenc_tail // handle tail if no more full 4-block sets
ldp $res2q, $res3q, [$input_ptr, #32] // AES blocks 2,3 load plaintext
ldp $res0q, $res1q, [$input_ptr], #64 // AES blocks 0,1 load plaintext
// Compute and store first 4 ciphertext blocks
PLATFORM_EOR3($res0, $res0, $v_rkN, $ctr0) // AES block 0 - result = PT ^ AES(ctr0)
PLATFORM_EOR3($res1, $res1, $v_rkN, $ctr1) // AES block 1 - result = PT ^ AES(ctr1)
PLATFORM_EOR3($res2, $res2, $v_rkN, $ctr2) // AES block 2 - result = PT ^ AES(ctr2)
PLATFORM_EOR3($res3, $res3, $v_rkN, $ctr3) // AES block 3 - result = PT ^ AES(ctr3)
st1 { $res0.16b, $res1.16b, $res2.16b, $res3.16b}, [$output_ptr], #64 // AES blocks 0-3 - store result
// Load counter values for the second iteration from the stack
ldp $ctr0q, $ctr1q, [sp, #$ctr0_sp_offset]
ldp $ctr2q, $ctr3q, [sp, #$ctr2_sp_offset]
// Prepare and store counter values for the third iteration
rev $tmp_gpr_w, $W_T32
str $tmp_gpr_w, [sp, #@{[ $ctr0_sp_offset + 12 ]}] // ctr + 8
add $tmp_gpr_w, $W_T32, #1
rev $tmp_gpr_w, $tmp_gpr_w
str $tmp_gpr_w, [sp, #@{[ $ctr1_sp_offset + 12 ]}] // ctr + 9
add $tmp_gpr_w, $W_T32, #2
rev $tmp_gpr_w, $tmp_gpr_w
str $tmp_gpr_w, [sp, #@{[ $ctr2_sp_offset + 12 ]}] // ctr + 10
add $tmp_gpr_w, $W_T32, #3
rev $tmp_gpr_w, $tmp_gpr_w
str $tmp_gpr_w, [sp, #@{[ $ctr3_sp_offset + 12 ]}] // ctr + 11
add $W_T32, $W_T32, #4 // Advance counter base
cmp $input_ptr, $main_end_input_ptr // check if we have <= 4 blocks remaining
b.ge .Lenc_prepretail // go to prepretail if < 2 full loops left
.Lenc_main_loop: // main loop start (processes 4 blocks per iteration)
// --- AES Pipeline for blocks 4k+4 to 4k+7 ---
aese $ctr0.16b, $rk0.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 0
aese $ctr1.16b, $rk0.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 0
aese $ctr2.16b, $rk0.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 0
aese $ctr3.16b, $rk0.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 0
ldr $mod_constantd, [sp, #$mod_constant_sp_offset] // Load GHASH reduction constant
aese $ctr0.16b, $rk1.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 1
aese $ctr1.16b, $rk1.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 1
aese $ctr2.16b, $rk1.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 1
aese $ctr3.16b, $rk1.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 1
// --- GHASH Pipeline (interleaved with AES) for blocks 4k to 4k+3 ---
rev64 $res0.16b, $res0.16b // GHASH block 4k - Byte swap CT
rev64 $res1.16b, $res1.16b // GHASH block 4k+1 - Byte swap CT
rev64 $res2.16b, $res2.16b // GHASH block 4k+2 - Byte swap CT
rev64 $res3.16b, $res3.16b // GHASH block 4k+3 - Byte swap CT
aese $ctr0.16b, $rk2.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 2
ext $acc_l.16b, $acc_l.16b, $acc_l.16b, #8 // GHASH - prepare acc for XOR
aese $ctr1.16b, $rk2.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 2
eor $res0.16b, $res0.16b, $acc_l.16b // GHASH block 4k - Y_i = CT_i ^ Y_{i-1}
aese $ctr2.16b, $rk2.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 2
pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low
aese $ctr3.16b, $rk2.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 2
pmull2 $acc_h.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high
aese $ctr0.16b, $rk3.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 3
mov $acc_md, $h34k.d[1] // GHASH block 4k - mid Karatsuba key
aese $ctr1.16b, $rk3.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 3
mov $ghash_t0d, $res0.d[1] // GHASH block 4k - mid
aese $ctr2.16b, $rk3.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 3
eor $ghash_t0.8b, $ghash_t0.8b, $res0.8b // GHASH block 4k - mid
aese $ctr3.16b, $rk3.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 3
mov $ghash_t1d, $res1.d[1] // GHASH block 4k+1 - mid
aese $ctr0.16b, $rk4.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 4
eor $ghash_t1.8b, $ghash_t1.8b, $res1.8b // GHASH block 4k+1 - mid
aese $ctr1.16b, $rk4.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 4
pmull $acc_m.1q, $ghash_t0.1d, $acc_m.1d // GHASH block 4k - mid
aese $ctr2.16b, $rk4.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 4
pmull $ghash_t1.1q, $ghash_t1.1d, $h34k.1d // GHASH block 4k+1 - mid
aese $ctr3.16b, $rk4.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 4
eor $acc_m.16b, $acc_m.16b, $ghash_t1.16b // GHASH block 4k+1 - mid
aese $ctr0.16b, $rk5.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 5
ext $ghash_t2.16b, $ghash_t2.16b, $res2.16b, #8 // GHASH block 4k+2 - mid
aese $ctr1.16b, $rk5.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 5
eor $ghash_t2.16b, $ghash_t2.16b, $res2.16b // GHASH block 4k+2 - mid
aese $ctr2.16b, $rk5.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 5
pmull2 $ghash_t2.1q, $ghash_t2.2d, $h12k.2d // GHASH block 4k+2 - mid
aese $ctr3.16b, $rk5.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 5
mov $ghash_t3d, $res3.d[1] // GHASH block 4k+3 - mid
aese $ctr0.16b, $rk6.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 6
eor $ghash_t3.8b, $ghash_t3.8b, $res3.8b // GHASH block 4k+3 - mid
aese $ctr1.16b, $rk6.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 6
pmull $ghash_t3.1q, $ghash_t3.1d, $h12k.1d // GHASH block 4k+3 - mid
aese $ctr2.16b, $rk6.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 6
PLATFORM_EOR3($acc_m, $acc_m, $ghash_t2, $ghash_t3) // GHASH block 4k+2/3 - mid
aese $ctr3.16b, $rk6.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 6
pmull2 $ghash_t2.1q, $res0.2d, $h4.2d // GHASH block 4k - high
aese $ctr0.16b, $rk7.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 7
pmull2 $ghash_t1.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high
eor $ghash_t2.16b, $ghash_t2.16b, $ghash_t1.16b // GHASH block 4k+3 - high
aese $ctr1.16b, $rk7.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 7
pmull2 $ghash_t3.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high
aese $ctr2.16b, $rk7.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 7
pmull $ghash_t1.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low
aese $ctr3.16b, $rk7.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 7
pmull $ghash_t0.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low
PLATFORM_EOR3($acc_h, $acc_h, $ghash_t2, $ghash_t3) // GHASH block 4k/1/2/3 - high
pmull $ghash_t2.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low
ldp $res2q, $res3q, [$input_ptr, #32]
ldp $res0q, $res1q, [$input_ptr], #64
eor $ghash_t0.16b, $ghash_t0.16b, $ghash_t1.16b // GHASH block 4k+1 - low
aese $ctr0.16b, $rk8.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 8
PLATFORM_EOR3($acc_l, $acc_l, $ghash_t2, $ghash_t0) // GHASH block 4k/1/2/3 - low
aese $ctr1.16b, $rk8.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 8
PLATFORM_EOR3($acc_m, $acc_m, $acc_h, $acc_l) // MODULO - karatsuba tidy up
aese $ctr2.16b, $rk8.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 8
pmull $ghash_t0.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid
aese $ctr3.16b, $rk8.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 8
cmp $rounds, #12 // setup flags for AES-128/192/256 check
b.lt .Lenc_main_loop_continue // branch if AES-128
ldp $rk9_11_tmpq, $rk10_12q, [$cc, #144] // load rk9, rk10
aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 9
aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 9
aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 9
aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 9
aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 10
aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 10
aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 10
aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 10
b.eq .Lenc_main_loop_continue // branch if AES-192
ldp $rk9_11_tmpq, $rk10_12q, [$cc, #176] // load rk11, rk12
aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 11
aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 11
aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 11
aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 11
aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 12
aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 12
aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 12
aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 12
.Lenc_main_loop_continue:
ext $acc_h.16b, $acc_h.16b, $acc_h.16b, #8 // MODULO - other top alignment
PLATFORM_EOR3($acc_m, $acc_m, $ghash_t0, $acc_h) // MODULO - fold into mid
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low
ext $ghash_t0.16b, $acc_m.16b, $acc_m.16b, #8 // MODULO - other mid alignment
PLATFORM_EOR3($acc_l, $acc_h, $acc_l, $ghash_t0) // MODULO - fold into low
aese $ctr0.16b, $rkNm1.16b // AES block 4k+4 - round N-1
PLATFORM_EOR3($res0, $res0, $v_rkN, $ctr0) // AES block 4k+4 - result
aese $ctr1.16b, $rkNm1.16b // AES block 4k+5 - round N-1
PLATFORM_EOR3($res1, $res1, $v_rkN, $ctr1) // AES block 4k+5 - result
aese $ctr2.16b, $rkNm1.16b // AES block 4k+6 - round N-1
PLATFORM_EOR3($res2, $res2, $v_rkN, $ctr2) // AES block 4k+6 - result
aese $ctr3.16b, $rkNm1.16b // AES block 4k+7 - round N-1
PLATFORM_EOR3($res3, $res3, $v_rkN, $ctr3) // AES block 4k+7 - result
ldp $ctr0q, $ctr1q, [sp, #$ctr0_sp_offset]
ldp $ctr2q, $ctr3q, [sp, #$ctr2_sp_offset]
// We used these registers as temporaries above so reload the RKs.
ldp $rk2q, $rk3q, [$cc, #32] // load rk2, rk3
ldp $rk4q, $rk5q, [$cc, #64] // load rk4, rk5
st1 { $res0.16b, $res1.16b, $res2.16b, $res3.16b}, [$output_ptr], #64 // AES blocks 4k+4-7 - store result
rev $tmp_gpr_w, $W_T32
str $tmp_gpr_w, [sp, #@{[ $ctr0_sp_offset + 12 ]}]
add $tmp_gpr_w, $W_T32, #1
rev $tmp_gpr_w, $tmp_gpr_w
str $tmp_gpr_w, [sp, #@{[ $ctr1_sp_offset + 12 ]}]
add $tmp_gpr_w, $W_T32, #2
rev $tmp_gpr_w, $tmp_gpr_w
str $tmp_gpr_w, [sp, #@{[ $ctr2_sp_offset + 12 ]}]
add $tmp_gpr_w, $W_T32, #3
rev $tmp_gpr_w, $tmp_gpr_w
str $tmp_gpr_w, [sp, #@{[ $ctr3_sp_offset + 12 ]}]
add $W_T32, $W_T32, #4
cmp $input_ptr, $main_end_input_ptr // .LOOP CONTROL
b.lt .Lenc_main_loop
.Lenc_prepretail: // PREPRETAIL
aese $ctr1.16b, $rk0.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 0
rev64 $res2.16b, $res2.16b // GHASH block 2
aese $ctr2.16b, $rk0.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 0
aese $ctr0.16b, $rk0.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 0
rev64 $res0.16b, $res0.16b // GHASH block 0
ext $acc_l.16b, $acc_l.16b, $acc_l.16b, #8 // PRE 0
aese $ctr2.16b, $rk1.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 1
aese $ctr0.16b, $rk1.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 1
eor $res0.16b, $res0.16b, $acc_l.16b // PRE 1
rev64 $res1.16b, $res1.16b // GHASH block 1
aese $ctr2.16b, $rk2.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 2
aese $ctr3.16b, $rk0.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 0
mov $acc_md, $h34k.d[1] // GHASH block 0 - mid Karatsuba key
aese $ctr1.16b, $rk1.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 1
pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 0 - low
mov $mod_constantd, $res0.d[1] // GHASH block 0 - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 0 - high
aese $ctr2.16b, $rk3.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 3
aese $ctr1.16b, $rk2.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 2
eor $mod_constant.8b, $mod_constant.8b, $res0.8b // GHASH block 0 - mid
aese $ctr0.16b, $rk2.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 2
aese $ctr3.16b, $rk1.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 1
aese $ctr1.16b, $rk3.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 3
pmull $acc_m.1q, $mod_constant.1d, $acc_m.1d // GHASH block 0 - mid
pmull2 $res0.1q, $res1.2d, $h3.2d // GHASH block 1 - high
pmull $mod_constant.1q, $res1.1d, $h3.1d // GHASH block 1 - low
aese $ctr3.16b, $rk2.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 2
eor $acc_h.16b, $acc_h.16b, $res0.16b // GHASH block 1 - high
mov $res0d, $res1.d[1] // GHASH block 1 - mid
aese $ctr0.16b, $rk3.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 3
eor $acc_l.16b, $acc_l.16b, $mod_constant.16b // GHASH block 1 - low
aese $ctr3.16b, $rk3.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 3
eor $res0.8b, $res0.8b, $res1.8b // GHASH block 1 - mid
mov $mod_constantd, $res2.d[1] // GHASH block 2 - mid
aese $ctr0.16b, $rk4.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 4
rev64 $res3.16b, $res3.16b // GHASH block 3
aese $ctr3.16b, $rk4.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 4
pmull $res0.1q, $res0.1d, $h34k.1d // GHASH block 1 - mid
eor $mod_constant.8b, $mod_constant.8b, $res2.8b // GHASH block 2 - mid
pmull $res1.1q, $res2.1d, $h2.1d // GHASH block 2 - low
aese $ctr3.16b, $rk5.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 5
aese $ctr2.16b, $rk4.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 4
eor $acc_m.16b, $acc_m.16b, $res0.16b // GHASH block 1 - mid
pmull2 $res0.1q, $res2.2d, $h2.2d // GHASH block 2 - high
eor $acc_l.16b, $acc_l.16b, $res1.16b // GHASH block 2 - low
ins $mod_constant.d[1], $mod_constant.d[0] // GHASH block 2 - mid
aese $ctr2.16b, $rk5.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 5
eor $acc_h.16b, $acc_h.16b, $res0.16b // GHASH block 2 - high
mov $res0d, $res3.d[1] // GHASH block 3 - mid
aese $ctr1.16b, $rk4.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 4
pmull2 $mod_constant.1q, $mod_constant.2d, $h12k.2d // GHASH block 2 - mid
eor $res0.8b, $res0.8b, $res3.8b // GHASH block 3 - mid
pmull2 $res1.1q, $res3.2d, $h1.2d // GHASH block 3 - high
aese $ctr1.16b, $rk5.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 5
pmull $res0.1q, $res0.1d, $h12k.1d // GHASH block 3 - mid
eor $acc_m.16b, $acc_m.16b, $mod_constant.16b // GHASH block 2 - mid
aese $ctr0.16b, $rk5.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 5
aese $ctr1.16b, $rk6.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 6
aese $ctr2.16b, $rk6.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 6
aese $ctr0.16b, $rk6.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 6
aese $ctr3.16b, $rk6.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 6
aese $ctr1.16b, $rk7.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 7
eor $acc_h.16b, $acc_h.16b, $res1.16b // GHASH block 3 - high
aese $ctr0.16b, $rk7.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 7
aese $ctr3.16b, $rk7.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 7
ldr $mod_constantd, [sp, #$mod_constant_sp_offset]
aese $ctr1.16b, $rk8.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 8
eor $acc_m.16b, $acc_m.16b, $res0.16b // GHASH block 3 - mid
pmull $res2.1q, $res3.1d, $h1.1d // GHASH block 3 - low
aese $ctr3.16b, $rk8.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 8
cmp $rounds, #12 // setup flags for AES-128/192/256 check
aese $ctr0.16b, $rk8.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 8
eor $acc_l.16b, $acc_l.16b, $res2.16b // GHASH block 3 - low
aese $ctr2.16b, $rk7.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 7
eor $acc_m.16b, $acc_m.16b, $acc_h.16b // karatsuba tidy up
aese $ctr2.16b, $rk8.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 8
pmull $res0.1q, $acc_h.1d, $mod_constant.1d
ext $acc_h.16b, $acc_h.16b, $acc_h.16b, #8
eor $acc_m.16b, $acc_m.16b, $acc_l.16b
b.lt .Lenc_finish_prepretail // branch if AES-128
ldp $rk9_11_tmpq, $rk10_12q, [$cc, #144] // load rk9, rk10
aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 9
aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 9
aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 9
aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 9
aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 10
aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 10
aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 10
aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 10
b.eq .Lenc_finish_prepretail // branch if AES-192
ldp $rk9_11_tmpq, $rk10_12q, [$cc, #176] // load rk11, rk12
aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 11
aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 11
aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 11
aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 11
aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 12
aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 12
aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 12
aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 12
.Lenc_finish_prepretail:
aese $ctr0.16b, $rkNm1.16b // AES block 0 - round N-1
aese $ctr1.16b, $rkNm1.16b // AES block 1 - round N-1
aese $ctr2.16b, $rkNm1.16b // AES block 2 - round N-1
aese $ctr3.16b, $rkNm1.16b // AES block 3 - round N-1
PLATFORM_EOR3($acc_m, $acc_m, $res0, $acc_h)
pmull $res0.1q, $acc_m.1d, $mod_constant.1d
ext $acc_m.16b, $acc_m.16b, $acc_m.16b, #8
PLATFORM_EOR3($acc_l, $acc_l, $res0, $acc_m)
.Lenc_tail: // TAIL: Process the final 1 to 4 blocks (last block may be partial)
ext $mod_constant.16b, $acc_l.16b, $acc_l.16b, #8 // Save current GHASH state for partial tag feed-in
sub $main_end_input_ptr, $end_input_ptr, $input_ptr // main_end_input_ptr is number of bytes left to process
ldp $input_l0, $input_h0, [$input_ptr], #16 // AES block 0 - load plaintext
eor $input_l0, $input_l0, $rkN_l // AES block 0 - round N low
eor $input_h0, $input_h0, $rkN_h // AES block 0 - round N high
cmp $main_end_input_ptr, #48
fmov $res0d, $input_l0 // AES block 0 - mov low
fmov $res0.d[1], $input_h0 // AES block 0 - mov high
eor $res1.16b, $res0.16b, $ctr0.16b // AES block 0 - result
b.gt .Lenc_blocks_more_than_3
cmp $main_end_input_ptr, #32
mov $ctr3.16b, $ctr2.16b
movi $acc_l.8b, #0
movi $acc_h.8b, #0
mov $ctr2.16b, $ctr1.16b
movi $acc_m.8b, #0
b.gt .Lenc_blocks_more_than_2
mov $ctr3.16b, $ctr1.16b
cmp $main_end_input_ptr, #16
b.gt .Lenc_blocks_more_than_1
b .Lenc_blocks_less_than_1
.Lenc_blocks_more_than_3: // blocks left > 3
st1 { $res1.16b}, [$output_ptr], #16 // AES final-3 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 // AES final-2 block - load input low & high
rev64 $res0.16b, $res1.16b // GHASH final-3 block
eor $input_l0, $input_l0, $rkN_l // AES final-2 block - round N low
eor $res0.16b, $res0.16b, $mod_constant.16b // feed in partial tag
eor $input_h0, $input_h0, $rkN_h // AES final-2 block - round N high
mov $ghash_t2d, $res0.d[1] // GHASH final-3 block - mid
fmov $res1d, $input_l0 // AES final-2 block - mov low
fmov $res1.d[1], $input_h0 // AES final-2 block - mov high
eor $ghash_t2.8b, $ghash_t2.8b, $res0.8b // GHASH final-3 block - mid
movi $mod_constant.8b, #0 // suppress further partial tag feed in
mov $acc_md, $h34k.d[1] // GHASH final-3 block - mid
pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH final-3 block - low
pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH final-3 block - high
pmull $acc_m.1q, $ghash_t2.1d, $acc_m.1d // GHASH final-3 block - mid
eor $res1.16b, $res1.16b, $ctr1.16b // AES final-2 block - result
.Lenc_blocks_more_than_2: // blocks left > 2
st1 { $res1.16b}, [$output_ptr], #16 // AES final-2 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 // AES final-1 block - load input low & high
rev64 $res0.16b, $res1.16b // GHASH final-2 block
eor $input_l0, $input_l0, $rkN_l // AES final-1 block - round N low
eor $res0.16b, $res0.16b, $mod_constant.16b // feed in partial tag
fmov $res1d, $input_l0 // AES final-1 block - mov low
eor $input_h0, $input_h0, $rkN_h // AES final-1 block - round N high
fmov $res1.d[1], $input_h0 // AES final-1 block - mov high
movi $mod_constant.8b, #0 // suppress further partial tag feed in
pmull2 $ghash_t0.1q, $res0.2d, $h3.2d // GHASH final-2 block - high
mov $ghash_t2d, $res0.d[1] // GHASH final-2 block - mid
pmull $ghash_t1.1q, $res0.1d, $h3.1d // GHASH final-2 block - low
eor $ghash_t2.8b, $ghash_t2.8b, $res0.8b // GHASH final-2 block - mid
eor $res1.16b, $res1.16b, $ctr2.16b // AES final-1 block - result
eor $acc_h.16b, $acc_h.16b, $ghash_t0.16b // GHASH final-2 block - high
pmull $ghash_t2.1q, $ghash_t2.1d, $h34k.1d // GHASH final-2 block - mid
eor $acc_l.16b, $acc_l.16b, $ghash_t1.16b // GHASH final-2 block - low
eor $acc_m.16b, $acc_m.16b, $ghash_t2.16b // GHASH final-2 block - mid
.Lenc_blocks_more_than_1: // blocks left > 1
st1 { $res1.16b}, [$output_ptr], #16 // AES final-1 block - store result
rev64 $res0.16b, $res1.16b // GHASH final-1 block: Byte Swap CT
ldp $input_l0, $input_h0, [$input_ptr], #16 // AES final block - load plaintext
eor $res0.16b, $res0.16b, $mod_constant.16b // Feed in partial tag
movi $mod_constant.8b, #0 // Clear for next block
eor $input_l0, $input_l0, $rkN_l // AES final block - round N low
mov $ghash_t2d, $res0.d[1] // GHASH final-1 block - mid
pmull2 $ghash_t0.1q, $res0.2d, $h2.2d // GHASH final-1 block - high
eor $input_h0, $input_h0, $rkN_h // AES final block - round N high
eor $ghash_t2.8b, $ghash_t2.8b, $res0.8b // GHASH final-1 block - mid
eor $acc_h.16b, $acc_h.16b, $ghash_t0.16b // GHASH final-1 block - high
ins $ghash_t2.d[1], $ghash_t2.d[0] // GHASH final-1 block - mid
fmov $res1d, $input_l0 // AES final block - mov low
fmov $res1.d[1], $input_h0 // AES final block - mov high
pmull2 $ghash_t2.1q, $ghash_t2.2d, $h12k.2d // GHASH final-1 block - mid
pmull $ghash_t1.1q, $res0.1d, $h2.1d // GHASH final-1 block - low
eor $res1.16b, $res1.16b, $ctr3.16b // AES final block - result
eor $acc_m.16b, $acc_m.16b, $ghash_t2.16b // GHASH final-1 block - mid
eor $acc_l.16b, $acc_l.16b, $ghash_t1.16b // GHASH final-1 block - low
.Lenc_blocks_less_than_1: // Last partial block handling
add $T32, $T32, $bit_length, lsr #7 // Calculate the updated counter based on the number of 16B chunks we processed
rev $W32, $W32
str $W32, [$counter, #12] // store the updated counter
and $bit_length, $bit_length, #127 // bit_length %= 128
mvn $rkN_l, xzr // Mask for low 64 bits
sub $bit_length, $bit_length, #128 //
neg $bit_length, $bit_length // Valid bits in the last block (1-128)
ldr $final_block_destq, [$output_ptr] // Load destination for merging
mvn $rkN_h, xzr // Mask for high 64 bits
and $bit_length, $bit_length, #127 // bit_length %= 128
lsr $rkN_h, $rkN_h, $bit_length // rkN_h is mask for top 64b of last block
cmp $bit_length, #64
csel $input_l0, $rkN_l, $rkN_h, lt
csel $input_h0, $rkN_h, xzr, lt
fmov $ctr0d, $input_l0 // ctr0d is mask for last block
fmov $ctr0.d[1], $input_h0
and $res1.16b, $res1.16b, $ctr0.16b // Mask out unused bits of the last CT block
rev64 $res0.16b, $res1.16b // GHASH final block - byte swap
eor $res0.16b, $res0.16b, $mod_constant.16b // Feed in partial tag
bif $res1.16b, $final_block_dest.16b, $ctr0.16b // Bitwise Insert: merge with existing data at output_ptr
pmull2 $ghash_t0.1q, $res0.2d, $h1.2d // GHASH final block - high
mov $mod_constantd, $res0.d[1] // GHASH final block - mid
pmull $ghash_t1.1q, $res0.1d, $h1.1d // GHASH final block - low
eor $acc_h.16b, $acc_h.16b, $ghash_t0.16b // GHASH final block - high
eor $mod_constant.8b, $mod_constant.8b, $res0.8b // GHASH final block - mid
pmull $mod_constant.1q, $mod_constant.1d, $h12k.1d // GHASH final block - mid
eor $acc_l.16b, $acc_l.16b, $ghash_t1.16b // GHASH final block - low
eor $acc_m.16b, $acc_m.16b, $mod_constant.16b // GHASH final block - mid
eor $res0.16b, $acc_l.16b, $acc_h.16b // MODULO - karatsuba tidy up
fmov $mod_constantd, $mod_constantx
eor $acc_m.16b, $acc_m.16b, $res0.16b // MODULO - karatsuba tidy up
pmull $res3.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid
ext $acc_h.16b, $acc_h.16b, $acc_h.16b, #8 // MODULO - other top alignment
PLATFORM_EOR3($acc_m, $acc_m, $res3, $acc_h) // MODULO - fold into mid
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low
ext $acc_m.16b, $acc_m.16b, $acc_m.16b, #8 // MODULO - other mid alignment
st1 { $res1.16b}, [$output_ptr] // store all 16B
PLATFORM_EOR3($acc_l, $acc_l, $acc_h, $acc_m) // MODULO - fold into low
ext $acc_l.16b, $acc_l.16b, $acc_l.16b, #8 // Byte swap GHASH result
rev64 $acc_l.16b, $acc_l.16b // Final Tag
mov $input_ptr, $len
st1 { $acc_l.16b }, [$current_tag] // Store final tag
ldp x19, x20, [sp, #16]
ldp $mod_constantx, x22, [sp, #32]
ldp d8, d9, [sp, #64]
ldp d10, d11, [sp, #80]
ldp d12, d13, [sp, #96]
ldp d14, d15, [sp, #112]
ldp x29, x30, [sp], #224
AARCH64_VALIDATE_LINK_REGISTER
ret
.size aes_gcm_enc_kernel,.-aes_gcm_enc_kernel
___
################################################################################
# aes_gcm_dec_kernel
################################################################################
$code_template .= <<"___";
.global aes_gcm_dec_kernel
.type aes_gcm_dec_kernel,%function
.align 4
aes_gcm_dec_kernel:
AARCH64_SIGN_LINK_REGISTER
stp x29, x30, [sp, #-224]!
mov x29, sp
stp x19, x20, [sp, #16]
ld1 { $ctr0.16b}, [x4]
mov $ctr1.16b, $ctr0.16b
mov $ctr2.16b, $ctr0.16b
mov $ctr3.16b, $ctr0.16b
mov $counter, x4
mov $cc, x5
stp $mod_constantx, x22, [sp, #32]
stp x23, x24, [sp, #48]
stp d8, d9, [sp, #64]
stp d10, d11, [sp, #80]
stp d12, d13, [sp, #96]
stp d14, d15, [sp, #112]
ldr $rounds_w, [$cc, #240] // Load number of AES rounds
add $input_l1, $cc, $rounds, lsl #4 // borrow input_l1 for last key
ldp $rkN_l, $rkN_h, [$input_l1] // load round N keys
ldr $rkNm1q, [$input_l1, #-16] // load round N-1 keys
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 // end_input_ptr
lsr $main_end_input_ptr, $bit_length, #3 // byte_len
mov $len, $main_end_input_ptr
ldr $ctr32w, [$counter, #12] // Load scalar 32-bit counter (CTR)
sub $main_end_input_ptr, $main_end_input_ptr, #1 // byte_len - 1
ldr $rk0q, [$cc, #0] // load rk0
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
rev $ctr32w, $ctr32w // Reverse it once for big-endian incrementing
uxtw $T32, $ctr32w // Zero extend reversed w9 into x10
str $ctr0q, [sp, #$ctr0_sp_offset]
str $ctr0q, [sp, #$ctr1_sp_offset]
str $ctr0q, [sp, #$ctr2_sp_offset]
str $ctr0q, [sp, #$ctr3_sp_offset]
rev $tmp_gpr_w, $ctr32w
mov $ctr0.s[3], $tmp_gpr_w
add $tmp_gpr_w, $ctr32w, #1
rev $tmp_gpr_w, $tmp_gpr_w
mov $ctr1.s[3], $tmp_gpr_w
add $tmp_gpr_w, $ctr32w, #2
rev $tmp_gpr_w, $tmp_gpr_w
mov $ctr2.s[3], $tmp_gpr_w
add $tmp_gpr_w, $ctr32w, #3
rev $tmp_gpr_w, $tmp_gpr_w
mov $ctr3.s[3], $tmp_gpr_w
add $tmp_gpr_w, $ctr32w, #4
rev $tmp_gpr_w, $tmp_gpr_w
str $tmp_gpr_w, [sp, #@{[ $ctr0_sp_offset + 12 ]}]
add $tmp_gpr_w, $ctr32w, #5
rev $tmp_gpr_w, $tmp_gpr_w
str $tmp_gpr_w, [sp, #@{[ $ctr1_sp_offset + 12 ]}]
add $tmp_gpr_w, $ctr32w, #6
rev $tmp_gpr_w, $tmp_gpr_w
str $tmp_gpr_w, [sp, #@{[ $ctr2_sp_offset + 12 ]}]
add $tmp_gpr_w, $ctr32w, #7
rev $tmp_gpr_w, $tmp_gpr_w
str $tmp_gpr_w, [sp, #@{[ $ctr3_sp_offset + 12 ]}]
add $ctr32w, $ctr32w, #8
// Pre-compute this value instead of using two instructions for moving and then shifting in the main loop
mov $mod_constantx, #0xc200000000000000
str $mod_constantx, [sp, #$mod_constant_sp_offset]
aese $ctr0.16b, $rk0.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 0
ldp $rk1q, $rk2q, [$cc, #16] // load rk1, rk2
aese $ctr1.16b, $rk0.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 0
ldp $rk3q, $rk4q, [$cc, #48] // load rk3, rk4
aese $ctr2.16b, $rk0.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 0
ldp $rk5q, $rk6q, [$cc, #80] // load rk5, rk6
aese $ctr3.16b, $rk0.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 0
ldp $h2q, $h3q, [$Htable, #32] // load h2, h3
aese $ctr0.16b, $rk1.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 1
ldp $rk7q, $rk8q, [$cc, #112] // load rk7, rk8
aese $ctr1.16b, $rk1.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 1
aese $ctr2.16b, $rk1.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 1
aese $ctr3.16b, $rk1.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 1
aese $ctr0.16b, $rk2.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 2
ext $h3.16b, $h3.16b, $h3.16b, #8
aese $ctr1.16b, $rk2.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 2
ext $h2.16b, $h2.16b, $h2.16b, #8
ldr $h4q, [$Htable, #80] // load h4
aese $ctr2.16b, $rk2.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 2
aese $ctr3.16b, $rk2.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 2
ext $h4.16b, $h4.16b, $h4.16b, #8
aese $ctr0.16b, $rk3.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 3
ld1 { $acc_l.16b}, [$current_tag]
aese $ctr1.16b, $rk3.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 3
ext $acc_l.16b, $acc_l.16b, $acc_l.16b, #8
aese $ctr2.16b, $rk3.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 3
rev64 $acc_l.16b, $acc_l.16b
aese $ctr3.16b, $rk3.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 3
trn2 $h34k.2d, $h3.2d, $h4.2d // h4l | h3l
aese $ctr0.16b, $rk4.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 4
ldr $h1q, [$Htable] // load h1
aese $ctr1.16b, $rk4.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 4
ext $h1.16b, $h1.16b, $h1.16b, #8
aese $ctr2.16b, $rk4.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 4
trn1 $acc_h.2d, $h3.2d, $h4.2d // h4h | h3h
aese $ctr3.16b, $rk4.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 4
trn2 $h12k.2d, $h1.2d, $h2.2d // h2l | h1l
aese $ctr0.16b, $rk5.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 5
aese $ctr1.16b, $rk5.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 5
aese $ctr3.16b, $rk5.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 5
aese $ctr2.16b, $rk5.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 5
aese $ctr0.16b, $rk6.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 6
aese $ctr1.16b, $rk6.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 6
aese $ctr2.16b, $rk6.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 6
aese $ctr3.16b, $rk6.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 6
aese $ctr0.16b, $rk7.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 7
aese $ctr1.16b, $rk7.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 7
aese $ctr2.16b, $rk7.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 7
aese $ctr3.16b, $rk7.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 7
aese $ctr0.16b, $rk8.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 8
aese $ctr1.16b, $rk8.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 8
aese $ctr2.16b, $rk8.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 8
aese $ctr3.16b, $rk8.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 8
cmp $rounds, #12 // setup flags for AES-128/192/256 check
b.lt .Ldec_finish_first_blocks // branch if AES-128
ldp $rk9_11_tmpq, $rk10_12q, [$cc, #144] // load rk9, rk10
aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 9
aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 9
aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 9
aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 9
aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 10
aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 10
aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 10
aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 10
b.eq .Ldec_finish_first_blocks // branch if AES-192
ldp $rk9_11_tmpq, $rk10_12q, [$cc, #176] // load rk11, rk12
aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 11
aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 11
aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 11
aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 11
aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 12
aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 12
aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 12
aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 12
.Ldec_finish_first_blocks:
ldr $rk9_11_tmpq, [$input_l1] // load rkN
cmp $input_ptr, $main_end_input_ptr // check if we have <= 4 blocks
eor $h34k.16b, $h34k.16b, $acc_h.16b // h4k | h3k
aese $ctr1.16b, $rkNm1.16b // AES block 1 - round N-1
aese $ctr2.16b, $rkNm1.16b // AES block 2 - round N-1
aese $ctr3.16b, $rkNm1.16b // AES block 3 - round N-1
aese $ctr0.16b, $rkNm1.16b // AES block 0 - round N-1
trn1 $t0.2d, $h1.2d, $h2.2d // h2h | h1h
eor $h12k.16b, $h12k.16b, $t0.16b // h2k | h1k
b.ge .Ldec_tail // handle tail
// Setup for AES blocks 0-3 is done purely on the NEON side, rather than mixing NEON and
// scalar instructions, because the final result of each AES block must be EORed with the
// final round key value ($v_rkN); keeping everything in NEON avoids several fmovs.
ldp $res2q, $res3q, [$input_ptr, #32] // AES blocks 2,3 load ciphertext
ldp $res0q, $res1q, [$input_ptr], #64 // AES blocks 0,1 load ciphertext
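// Each PLATFORM_EOR3 below is a three-way XOR: ciphertext block XOR keystream block
// XOR final round key (rkN, loaded just above). The final-round-key XOR completes the
// AES encryption of the counter block, and XORing with the ciphertext yields the plaintext.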
PLATFORM_EOR3($ctr0, $res0, $ctr0, $rk9_11_tmp) // AES block 0 - result
PLATFORM_EOR3($ctr1, $res1, $ctr1, $rk9_11_tmp) // AES block 1 - result
PLATFORM_EOR3($ctr2, $res2, $ctr2, $rk9_11_tmp) // AES block 2 - result
PLATFORM_EOR3($ctr3, $res3, $ctr3, $rk9_11_tmp) // AES block 3 - result
st1 { $ctr0.16b, $ctr1.16b, $ctr2.16b, $ctr3.16b}, [$output_ptr], #64 // AES blocks 0-3 - store result
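// Refresh the four counter blocks for the next iteration: the pre-formed CTR blocks are
// reloaded from the stack and only their big-endian 32-bit counter field (bytes 12-15)
// is rewritten with the byte-reversed values ctr32w .. ctr32w+3.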
ldr $ctr0q, [sp, #$ctr0_sp_offset]
ldr $ctr1q, [sp, #$ctr1_sp_offset]
ldr $ctr2q, [sp, #$ctr2_sp_offset]
ldr $ctr3q, [sp, #$ctr3_sp_offset]
rev $tmp_gpr_w, $ctr32w
str $tmp_gpr_w, [sp, #@{[ $ctr0_sp_offset + 12 ]}]
add $tmp_gpr_w, $ctr32w, #1
rev $tmp_gpr_w, $tmp_gpr_w
str $tmp_gpr_w, [sp, #@{[ $ctr1_sp_offset + 12 ]}]
add $tmp_gpr_w, $ctr32w, #2
rev $tmp_gpr_w, $tmp_gpr_w
str $tmp_gpr_w, [sp, #@{[ $ctr2_sp_offset + 12 ]}]
add $tmp_gpr_w, $ctr32w, #3
rev $tmp_gpr_w, $tmp_gpr_w
str $tmp_gpr_w, [sp, #@{[ $ctr3_sp_offset + 12 ]}]
add $ctr32w, $ctr32w, #4
cmp $input_ptr, $main_end_input_ptr // check if we have <= 4 blocks
b.ge .Ldec_prepretail // do prepretail
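// Main decrypt loop: run the AES rounds for the next four counter blocks while GHASHing
// the four ciphertext blocks loaded on the previous iteration, then do one modulo
// reduction of the accumulated hash before loading the next four ciphertext blocks.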
.Ldec_main_loop: // main loop start
aese $ctr0.16b, $rk0.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 0
aese $ctr1.16b, $rk0.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 0
aese $ctr2.16b, $rk0.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 0
aese $ctr3.16b, $rk0.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 0
aese $ctr0.16b, $rk1.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 1
aese $ctr1.16b, $rk1.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 1
aese $ctr2.16b, $rk1.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 1
aese $ctr3.16b, $rk1.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 1
rev64 $res0.16b, $res0.16b // GHASH block 4k
rev64 $res1.16b, $res1.16b // GHASH block 4k+1
rev64 $res2.16b, $res2.16b // GHASH block 4k+2
rev64 $res3.16b, $res3.16b // GHASH block 4k+3
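// GHASH uses a per-block Karatsuba split: high = pmull2 of the upper 64-bit halves,
// low = pmull of the lower halves, and the middle term multiplies (upper XOR lower) of
// the data by the pre-folded (upper XOR lower) of the hash power (h34k / h12k),
// accumulating into acc_h / acc_m / acc_l.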
aese $ctr0.16b, $rk2.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 2
ext $acc_l.16b, $acc_l.16b, $acc_l.16b, #8 // PRE 0
aese $ctr1.16b, $rk2.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 2
eor $res0.16b, $res0.16b, $acc_l.16b // PRE 1
aese $ctr2.16b, $rk2.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 2
pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low
aese $ctr3.16b, $rk2.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 2
pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 4k - high
aese $ctr0.16b, $rk3.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 3
mov $acc_md, $h34k.d[1] // GHASH block 4k - mid
aese $ctr1.16b, $rk3.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 3
mov $t0d, $res0.d[1] // GHASH block 4k - mid
aese $ctr2.16b, $rk3.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 3
eor $t0.8b, $t0.8b, $res0.8b // GHASH block 4k - mid
aese $ctr3.16b, $rk3.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 3
aese $ctr0.16b, $rk4.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 4
pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high
aese $ctr1.16b, $rk4.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 4
eor $acc_h.16b, $acc_h.16b, $t1.16b // GHASH block 4k+1 - high
aese $ctr2.16b, $rk4.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 4
mov $t3d, $res1.d[1] // GHASH block 4k+1 - mid
aese $ctr3.16b, $rk4.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 4
pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 4k - mid
aese $ctr0.16b, $rk5.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 5
pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low
aese $ctr1.16b, $rk5.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 5
eor $t3.8b, $t3.8b, $res1.8b // GHASH block 4k+1 - mid
aese $ctr3.16b, $rk5.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 5
aese $ctr2.16b, $rk5.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 5
pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low
aese $ctr0.16b, $rk6.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 6
PLATFORM_EOR3($acc_l, $acc_l, $t2, $t5) // GHASH block 4k+1 - low & GHASH block 4k+2 - low
aese $ctr1.16b, $rk6.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 6
mov $t6d, $res2.d[1] // GHASH block 4k+2 - mid
aese $ctr2.16b, $rk6.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 6
eor $t6.8b, $t6.8b, $res2.8b // GHASH block 4k+2 - mid
pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 4k+1 - mid
ins $t6.d[1], $t6.d[0] // GHASH block 4k+2 - mid
aese $ctr3.16b, $rk6.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 6
pmull2 $rk10_12.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high
mov $t9d, $res3.d[1] // GHASH block 4k+3 - mid
aese $ctr0.16b, $rk7.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 7
pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 4k+2 - mid
aese $ctr1.16b, $rk7.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 7
pmull $rk9_11_tmp.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low
aese $ctr2.16b, $rk7.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 7
PLATFORM_EOR3($acc_m, $acc_m, $t3, $t6) // GHASH block 4k+1 - mid & GHASH block 4k+2 - mid
aese $ctr3.16b, $rk7.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 7
pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high
eor $t9.8b, $t9.8b, $res3.8b // GHASH block 4k+3 - mid
PLATFORM_EOR3($acc_h, $acc_h, $rk10_12, $t7) // GHASH block 4k+2 - high & GHASH block 4k+3 - high
aese $ctr0.16b, $rk8.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 8
aese $ctr1.16b, $rk8.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 8
aese $ctr2.16b, $rk8.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 8
pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 4k+3 - mid
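// MODULO: reduce the 256-bit Karatsuba result (acc_h : acc_m : acc_l) back to 128 bits.
// The reduction constant (0xc2 shifted left by 56 bits) is reloaded from the stack; the
// high half is folded into the middle via pmull with this constant, and the middle is
// then folded into the low half the same way.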
ldr $mod_constantd, [sp, #$mod_constant_sp_offset]
aese $ctr3.16b, $rk8.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 8
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid
PLATFORM_EOR3($acc_m, $acc_m, $t9, $mod_t) // GHASH block 4k+3 - mid & MODULO - fold into mid
eor $acc_l.16b, $acc_l.16b, $rk9_11_tmp.16b // GHASH block 4k+3 - low
eor $t9.16b, $acc_l.16b, $acc_h.16b // MODULO - karatsuba tidy up
ext $acc_h.16b, $acc_h.16b, $acc_h.16b, #8 // MODULO - other top alignment
PLATFORM_EOR3($acc_m, $acc_m, $t9, $acc_h) // MODULO - karatsuba tidy up & MODULO - fold into mid
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low
ext $acc_m.16b, $acc_m.16b, $acc_m.16b, #8 // MODULO - other mid alignment
PLATFORM_EOR3($acc_l, $acc_l, $mod_constant, $acc_m) // MODULO - fold into low
cmp $rounds_w, #12 // setup flags for AES-128/192/256 check
b.lt .Ldec_main_loop_continue // branch if AES-128
ldp $rk9_11_tmpq, $rk10_12q, [$cc, #144] // load rk9, rk10
aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 9
aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 9
aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 9
aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 9
aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 10
aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 10
aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 10
aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 10
b.eq .Ldec_main_loop_continue // branch if AES-192
ldp $rk9_11_tmpq, $rk10_12q, [$cc, #176] // load rk11, rk12
aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 11
aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 11
aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 11
aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 11
aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 4k+4 - round 12
aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 4k+5 - round 12
aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 4k+6 - round 12
aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 4k+7 - round 12
.Ldec_main_loop_continue:
ldr $rk9_11_tmpq, [$input_l1] // load rkN
ldp $res2q, $res3q, [$input_ptr, #32] // AES blocks 2,3 load ciphertext
ldp $res0q, $res1q, [$input_ptr], #64 // AES blocks 0,1 load ciphertext
aese $ctr0.16b, $rkNm1.16b // AES block 4k+4 - round N-1
PLATFORM_EOR3($ctr0, $res0, $ctr0, $rk9_11_tmp) // AES block 4k+4 - result
aese $ctr1.16b, $rkNm1.16b // AES block 4k+5 - round N-1
PLATFORM_EOR3($ctr1, $res1, $ctr1, $rk9_11_tmp) // AES block 4k+5 - result
aese $ctr2.16b, $rkNm1.16b // AES block 4k+6 - round N-1
PLATFORM_EOR3($ctr2, $res2, $ctr2, $rk9_11_tmp) // AES block 4k+6 - result
aese $ctr3.16b, $rkNm1.16b // AES block 4k+7 - round N-1
PLATFORM_EOR3($ctr3, $res3, $ctr3, $rk9_11_tmp) // AES block 4k+7 - result
st1 { $ctr0.16b, $ctr1.16b, $ctr2.16b, $ctr3.16b}, [$output_ptr], #64 // AES blocks 4k+4-7 - store result
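// As before the loop: reload the pre-formed counter blocks and refresh their big-endian
// counter fields with ctr32w .. ctr32w+3.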
ldr $ctr0q, [sp, #$ctr0_sp_offset]
ldr $ctr1q, [sp, #$ctr1_sp_offset]
ldr $ctr2q, [sp, #$ctr2_sp_offset]
ldr $ctr3q, [sp, #$ctr3_sp_offset]
rev $tmp_gpr_w, $ctr32w
str $tmp_gpr_w, [sp, #@{[ $ctr0_sp_offset + 12 ]}]
add $tmp_gpr_w, $ctr32w, #1
rev $tmp_gpr_w, $tmp_gpr_w
str $tmp_gpr_w, [sp, #@{[ $ctr1_sp_offset + 12 ]}]
add $tmp_gpr_w, $ctr32w, #2
rev $tmp_gpr_w, $tmp_gpr_w
str $tmp_gpr_w, [sp, #@{[ $ctr2_sp_offset + 12 ]}]
add $tmp_gpr_w, $ctr32w, #3
rev $tmp_gpr_w, $tmp_gpr_w
str $tmp_gpr_w, [sp, #@{[ $ctr3_sp_offset + 12 ]}]
add $ctr32w, $ctr32w, #4
cmp $input_ptr, $main_end_input_ptr
b.lt .Ldec_main_loop
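// PREPRETAIL: GHASH the four ciphertext blocks loaded above and run the AES rounds for
// counter blocks ctr0-ctr3, without loading or storing any data; the tail code below
// consumes these keystream blocks one at a time.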
.Ldec_prepretail: // PREPRETAIL
rev64 $res0.16b, $res0.16b // GHASH block 0
rev64 $res1.16b, $res1.16b // GHASH block 1
ext $acc_l.16b, $acc_l.16b, $acc_l.16b, #8 // PRE 0
aese $ctr0.16b, $rk0.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 0
aese $ctr1.16b, $rk0.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 0
eor $res0.16b, $res0.16b, $acc_l.16b // PRE 1
rev64 $res2.16b, $res2.16b // GHASH block 2
aese $ctr1.16b, $rk1.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 1
pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 0 - low
mov $t0d, $res0.d[1] // GHASH block 0 - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 0 - high
aese $ctr2.16b, $rk0.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 0
mov $acc_md, $h34k.d[1] // GHASH block 0 - mid
aese $ctr0.16b, $rk1.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 1
eor $t0.8b, $t0.8b, $res0.8b // GHASH block 0 - mid
pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 1 - high
aese $ctr2.16b, $rk1.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 1
rev64 $res3.16b, $res3.16b // GHASH block 3
aese $ctr3.16b, $rk0.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 0
pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 0 - mid
eor $acc_h.16b, $acc_h.16b, $t1.16b // GHASH block 1 - high
pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 1 - low
aese $ctr3.16b, $rk1.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 1
mov $t3d, $res1.d[1] // GHASH block 1 - mid
aese $ctr0.16b, $rk2.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 2
aese $ctr1.16b, $rk2.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 2
eor $acc_l.16b, $acc_l.16b, $t2.16b // GHASH block 1 - low
aese $ctr2.16b, $rk2.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 2
aese $ctr0.16b, $rk3.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 3
mov $t6d, $res2.d[1] // GHASH block 2 - mid
aese $ctr3.16b, $rk2.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 2
eor $t3.8b, $t3.8b, $res1.8b // GHASH block 1 - mid
pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 2 - low
aese $ctr0.16b, $rk4.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 4
aese $ctr3.16b, $rk3.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 3
eor $t6.8b, $t6.8b, $res2.8b // GHASH block 2 - mid
pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 1 - mid
aese $ctr0.16b, $rk5.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 5
eor $acc_l.16b, $acc_l.16b, $t5.16b // GHASH block 2 - low
aese $ctr3.16b, $rk4.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 4
pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 3 - high
eor $acc_m.16b, $acc_m.16b, $t3.16b // GHASH block 1 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d // GHASH block 2 - high
aese $ctr3.16b, $rk5.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 5
ins $t6.d[1], $t6.d[0] // GHASH block 2 - mid
aese $ctr2.16b, $rk3.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 3
aese $ctr1.16b, $rk3.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 3
PLATFORM_EOR3($acc_h, $acc_h, $t4, $t7) // GHASH block 2 - high & GHASH block 3 - high
pmull $t8.1q, $res3.1d, $h1.1d // GHASH block 3 - low
aese $ctr2.16b, $rk4.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 4
mov $t9d, $res3.d[1] // GHASH block 3 - mid
aese $ctr1.16b, $rk4.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 4
pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 2 - mid
aese $ctr2.16b, $rk5.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 5
eor $t9.8b, $t9.8b, $res3.8b // GHASH block 3 - mid
aese $ctr1.16b, $rk5.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 5
aese $ctr3.16b, $rk6.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 6
eor $acc_m.16b, $acc_m.16b, $t6.16b // GHASH block 2 - mid
aese $ctr2.16b, $rk6.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 6
aese $ctr0.16b, $rk6.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 6
movi $mod_constant.8b, #0xc2
aese $ctr1.16b, $rk6.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 6
eor $acc_l.16b, $acc_l.16b, $t8.16b // GHASH block 3 - low
pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 3 - mid
aese $ctr3.16b, $rk7.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 7
aese $ctr1.16b, $rk7.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 7
aese $ctr0.16b, $rk7.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 7
eor $acc_m.16b, $acc_m.16b, $t9.16b // GHASH block 3 - mid
aese $ctr3.16b, $rk8.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 8
aese $ctr2.16b, $rk7.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 7
eor $t9.16b, $acc_l.16b, $acc_h.16b // MODULO - karatsuba tidy up
aese $ctr1.16b, $rk8.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 8
aese $ctr0.16b, $rk8.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 8
shl $mod_constantd, $mod_constantd, #56 // mod_constant
aese $ctr2.16b, $rk8.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 8
cmp $rounds_w, #12 // setup flags for AES-128/192/256 check
b.lt .Ldec_finish_prepretail // branch if AES-128
ldp $rk9_11_tmpq, $rk10_12q, [$cc, #144] // load rk9, rk10
aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 9
aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 9
aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 9
aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 9
aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 10
aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 10
aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 10
aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 10
b.eq .Ldec_finish_prepretail // branch if AES-192
ldp $rk9_11_tmpq, $rk10_12q, [$cc, #176] // load rk11, rk12
aese $ctr0.16b, $rk9_11_tmp.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 11
aese $ctr1.16b, $rk9_11_tmp.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 11
aese $ctr2.16b, $rk9_11_tmp.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 11
aese $ctr3.16b, $rk9_11_tmp.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 11
aese $ctr0.16b, $rk10_12.16b \n aesmc $ctr0.16b, $ctr0.16b // AES block 0 - round 12
aese $ctr1.16b, $rk10_12.16b \n aesmc $ctr1.16b, $ctr1.16b // AES block 1 - round 12
aese $ctr2.16b, $rk10_12.16b \n aesmc $ctr2.16b, $ctr2.16b // AES block 2 - round 12
aese $ctr3.16b, $rk10_12.16b \n aesmc $ctr3.16b, $ctr3.16b // AES block 3 - round 12
.Ldec_finish_prepretail:
eor $acc_m.16b, $acc_m.16b, $t9.16b // MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid
ext $acc_h.16b, $acc_h.16b, $acc_h.16b, #8 // MODULO - other top alignment
PLATFORM_EOR3($acc_m, $acc_m, $mod_t, $acc_h) // MODULO - fold into mid
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low
ext $acc_m.16b, $acc_m.16b, $acc_m.16b, #8 // MODULO - other mid alignment
PLATFORM_EOR3($acc_l, $acc_l, $mod_constant, $acc_m) // MODULO - fold into low
aese $ctr1.16b, $rkNm1.16b // AES block 1 - round N-1
aese $ctr0.16b, $rkNm1.16b // AES block 0 - round N-1
aese $ctr3.16b, $rkNm1.16b // AES block 3 - round N-1
aese $ctr2.16b, $rkNm1.16b // AES block 2 - round N-1
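// TAIL: between one and four blocks remain, the last possibly partial. Each block is
// decrypted by EORing its ciphertext with the matching keystream block and with the final
// round key held in scalar registers (rkN_l / rkN_h), and is then GHASHed; the first tail
// block also absorbs the running tag (saved in t0), after which further feed-in is suppressed.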
.Ldec_tail: // TAIL
sub $main_end_input_ptr, $end_input_ptr, $input_ptr // main_end_input_ptr is number of bytes left to process
ld1 { $res1.16b}, [$input_ptr], #16 // AES block 0 - load ciphertext
eor $ctr0.16b, $res1.16b, $ctr0.16b // AES block 0 - result
mov $output_l0, $ctr0.d[0] // AES block 0 - mov low
mov $output_h0, $ctr0.d[1] // AES block 0 - mov high
ext $t0.16b, $acc_l.16b, $acc_l.16b, #8 // prepare final partial tag
eor $output_l0, $output_l0, $rkN_l // AES block 0 - round N low
eor $output_h0, $output_h0, $rkN_h // AES block 0 - round N high
cmp $main_end_input_ptr, #48
b.gt .Ldec_blocks_more_than_3
mov $ctr3.16b, $ctr2.16b // at most 3 blocks left - shift keystream blocks down so they are consumed in order
movi $acc_m.8b, #0 // zero the GHASH accumulators (the running tag was saved in t0 above)
movi $acc_l.8b, #0
movi $acc_h.8b, #0
mov $ctr2.16b, $ctr1.16b
cmp $main_end_input_ptr, #32
b.gt .Ldec_blocks_more_than_2 // blocks left > 2
mov $ctr3.16b, $ctr1.16b // at most 2 blocks left - the final block will use keystream block 1
cmp $main_end_input_ptr, #16
b.gt .Ldec_blocks_more_than_1 // blocks left > 1
b .Ldec_blocks_less_than_1 // blocks left <= 1
.Ldec_blocks_more_than_3: // blocks left > 3
rev64 $res0.16b, $res1.16b // GHASH final-3 block
ld1 { $res1.16b}, [$input_ptr], #16 // AES final-2 block - load ciphertext
stp $output_l0, $output_h0, [$output_ptr], #16 // AES final-3 block - store result
mov $acc_md, $h34k.d[1] // GHASH final-3 block - mid
eor $res0.16b, $res0.16b, $t0.16b // feed in partial tag
eor $ctr0.16b, $res1.16b, $ctr1.16b // AES final-2 block - result
mov $rk4d, $res0.d[1] // GHASH final-3 block - mid
mov $output_l0, $ctr0.d[0] // AES final-2 block - mov low
mov $output_h0, $ctr0.d[1] // AES final-2 block - mov high
eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-3 block - mid
movi $t0.8b, #0 // suppress further partial tag feed in
pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH final-3 block - high
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d // GHASH final-3 block - mid
eor $output_l0, $output_l0, $rkN_l // AES final-2 block - round N low
pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH final-3 block - low
eor $output_h0, $output_h0, $rkN_h // AES final-2 block - round N high
.Ldec_blocks_more_than_2: // blocks left > 2
rev64 $res0.16b, $res1.16b // GHASH final-2 block
ld1 { $res1.16b}, [$input_ptr], #16 // AES final-1 block - load ciphertext
eor $res0.16b, $res0.16b, $t0.16b // feed in partial tag
stp $output_l0, $output_h0, [$output_ptr], #16 // AES final-2 block - store result
eor $ctr0.16b, $res1.16b, $ctr2.16b // AES final-1 block - result
mov $rk4d, $res0.d[1] // GHASH final-2 block - mid
pmull $rk3q1, $res0.1d, $h3.1d // GHASH final-2 block - low
pmull2 $rk2.1q, $res0.2d, $h3.2d // GHASH final-2 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-2 block - mid
mov $output_l0, $ctr0.d[0] // AES final-1 block - mov low
mov $output_h0, $ctr0.d[1] // AES final-1 block - mov high
eor $acc_l.16b, $acc_l.16b, $rk3.16b // GHASH final-2 block - low
movi $t0.8b, #0 // suppress further partial tag feed in
pmull $rk4v.1q, $rk4v.1d, $h34k.1d // GHASH final-2 block - mid
eor $acc_h.16b, $acc_h.16b, $rk2.16b // GHASH final-2 block - high
eor $output_l0, $output_l0, $rkN_l // AES final-1 block - round N low
eor $acc_m.16b, $acc_m.16b, $rk4v.16b // GHASH final-2 block - mid
eor $output_h0, $output_h0, $rkN_h // AES final-1 block - round N high
.Ldec_blocks_more_than_1: // blocks left > 1
stp $output_l0, $output_h0, [$output_ptr], #16 // AES final-1 block - store result
rev64 $res0.16b, $res1.16b // GHASH final-1 block
ld1 { $res1.16b}, [$input_ptr], #16 // AES final block - load ciphertext
eor $res0.16b, $res0.16b, $t0.16b // feed in partial tag
movi $t0.8b, #0 // suppress further partial tag feed in
mov $rk4d, $res0.d[1] // GHASH final-1 block - mid
eor $ctr0.16b, $res1.16b, $ctr3.16b // AES final block - result
pmull2 $rk2q1, $res0.2d, $h2.2d // GHASH final-1 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-1 block - mid
pmull $rk3q1, $res0.1d, $h2.1d // GHASH final-1 block - low
mov $output_l0, $ctr0.d[0] // AES final block - mov low
ins $rk4v.d[1], $rk4v.d[0] // GHASH final-1 block - mid
mov $output_h0, $ctr0.d[1] // AES final block - mov high
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d // GHASH final-1 block - mid
eor $output_l0, $output_l0, $rkN_l // AES final block - round N low
eor $acc_l.16b, $acc_l.16b, $rk3.16b // GHASH final-1 block - low
eor $acc_h.16b, $acc_h.16b, $rk2.16b // GHASH final-1 block - high
eor $acc_m.16b, $acc_m.16b, $rk4v.16b // GHASH final-1 block - mid
eor $output_h0, $output_h0, $rkN_h // AES final block - round N high
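// Final (possibly partial) block: store the updated counter back for the caller, build a
// byte mask from bit_length covering only the valid bytes of the last block, preserve any
// existing output bytes beyond the message end, and mask the ciphertext before the final
// GHASH so that padding bytes do not affect the tag.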
.Ldec_blocks_less_than_1: // blocks left <= 1
add $T32, $T32, $bit_length, lsr #7 // Calculate the updated counter based on the number of 16B chunks we processed
rev $W32, $W32
str $W32, [$counter, #12] // store the updated counter
and $bit_length, $bit_length, #127 // bit_length %= 128
mvn $rkN_h, xzr // rkN_h = 0xffffffffffffffff
sub $bit_length, $bit_length, #128 // bit_length -= 128
mvn $rkN_l, xzr // rkN_l = 0xffffffffffffffff
ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] // load existing output bytes that must be preserved
neg $bit_length, $bit_length // bit_length = 128 - #bits in input (in range [1,128])
and $bit_length, $bit_length, #127 // bit_length %= 128
lsr $rkN_h, $rkN_h, $bit_length // rkN_h is mask for top 64b of last block
cmp $bit_length, #64
csel $ctr32x, $rkN_l, $rkN_h, lt
csel $ctr96_b64x, $rkN_h, xzr, lt
fmov $ctr0d, $ctr32x // ctr0b is mask for last block
and $output_l0, $output_l0, $ctr32x
mov $ctr0.d[1], $ctr96_b64x
bic $end_input_ptr, $end_input_ptr, $ctr32x // mask out low existing bytes
bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x // mask out high existing bytes
orr $output_l0, $output_l0, $end_input_ptr
and $output_h0, $output_h0, $ctr96_b64x
orr $output_h0, $output_h0, $main_end_input_ptr
and $res1.16b, $res1.16b, $ctr0.16b // possibly partial last block has zeroes in highest bits
rev64 $res0.16b, $res1.16b // GHASH final block
eor $res0.16b, $res0.16b, $t0.16b // feed in partial tag
pmull $rk3q1, $res0.1d, $h1.1d // GHASH final block - low
mov $t0d, $res0.d[1] // GHASH final block - mid
eor $t0.8b, $t0.8b, $res0.8b // GHASH final block - mid
pmull2 $rk2q1, $res0.2d, $h1.2d // GHASH final block - high
pmull $t0.1q, $t0.1d, $h12k.1d // GHASH final block - mid
eor $acc_h.16b, $acc_h.16b, $rk2.16b // GHASH final block - high
eor $acc_l.16b, $acc_l.16b, $rk3.16b // GHASH final block - low
eor $acc_m.16b, $acc_m.16b, $t0.16b // GHASH final block - mid
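// Final modulo reduction (the same fold as in the main loop); the result is then
// byte-reversed and stored as the authentication tag.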
ldr $mod_constantd, [sp, #$mod_constant_sp_offset]
eor $t9.16b, $acc_l.16b, $acc_h.16b // MODULO - karatsuba tidy up
eor $acc_m.16b, $acc_m.16b, $t9.16b // MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid
ext $acc_h.16b, $acc_h.16b, $acc_h.16b, #8 // MODULO - other top alignment
eor $acc_m.16b, $acc_m.16b, $mod_t.16b // MODULO - fold into mid
eor $acc_m.16b, $acc_m.16b, $acc_h.16b // MODULO - fold into mid
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low
ext $acc_m.16b, $acc_m.16b, $acc_m.16b, #8 // MODULO - other mid alignment
eor $acc_l.16b, $acc_l.16b, $mod_constant.16b // MODULO - fold into low
stp $output_l0, $output_h0, [$output_ptr]
eor $acc_l.16b, $acc_l.16b, $acc_m.16b // MODULO - fold into low
ext $acc_l.16b, $acc_l.16b, $acc_l.16b, #8
rev64 $acc_l.16b, $acc_l.16b // Final Tag
mov x0, $len
st1 { $acc_l.16b }, [$current_tag] // Store final tag
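// Epilogue: restore the callee-saved general-purpose and SIMD registers saved in the
// prologue, release the 224-byte stack frame and return (x0 was set from the length above).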
ldp x19, x20, [sp, #16]
ldp x21, x22, [sp, #32]
ldp x23, x24, [sp, #48]
ldp d8, d9, [sp, #64]
ldp d10, d11, [sp, #80]
ldp d12, d13, [sp, #96]
ldp d14, d15, [sp, #112]
ldp x29, x30, [sp], #224
AARCH64_VALIDATE_LINK_REGISTER
ret
.size aes_gcm_dec_kernel,.-aes_gcm_dec_kernel
___
# 1. Print directives
print $header_directives;
# 2. Print Legacy implementation
print get_transformed_code(0, $code_template);
print "\n";
# 3. Print EOR3 implementation
print get_transformed_code(1, $code_template);
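# Note: get_transformed_code() is defined earlier in this file; its first argument
# presumably selects how the PLATFORM_EOR3 macro is expanded (a plain EOR sequence for
# the legacy variant vs. the SHA-3 EOR3 instruction), so both implementations are
# generated from the single code template above.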
print <<'___';
#endif // __ARM_MAX_ARCH__ >= 8
___
# Close the handle (which flushes to arm-xlate.pl)
close STDOUT or die "error closing STDOUT: $!";
}
}