Add optimised AArch64 GCM.

Cost: 6.3KiB, based on the size of the .o file. (The bssl tool size
doesn't really change, probably due to padding somewhere.)

This code originally came from ARM, but David has merged the AES-128- and
AES-256-specific code into a single function that works across AES key sizes.

Speeds from an M1 Pro, before:

Did 16546000 AES-128-GCM (16 bytes) seal operations in 1000018us (16545702.2 ops/sec): 264.7 MB/s
Did 10450500 AES-128-GCM (256 bytes) seal operations in 1000011us (10450385.0 ops/sec): 2675.3 MB/s
Did 2822500 AES-128-GCM (1350 bytes) seal operations in 1000042us (2822381.5 ops/sec): 3810.2 MB/s
Did 547000 AES-128-GCM (8192 bytes) seal operations in 1000826us (546548.6 ops/sec): 4477.3 MB/s
Did 279000 AES-128-GCM (16384 bytes) seal operations in 1000411us (278885.4 ops/sec): 4569.3 MB/s
Did 16991250 AES-256-GCM (16 bytes) seal operations in 1000001us (16991233.0 ops/sec): 271.9 MB/s
Did 9257000 AES-256-GCM (256 bytes) seal operations in 1000072us (9256333.5 ops/sec): 2369.6 MB/s
Did 2398000 AES-256-GCM (1350 bytes) seal operations in 1000002us (2397995.2 ops/sec): 3237.3 MB/s
Did 465000 AES-256-GCM (8192 bytes) seal operations in 1001108us (464485.4 ops/sec): 3805.1 MB/s
Did 240000 AES-256-GCM (16384 bytes) seal operations in 1002704us (239352.8 ops/sec): 3921.6 MB/s

After:

Did 16670000 AES-128-GCM (16 bytes) seal operations in 1000054us (16669099.9 ops/sec): 266.7 MB/s
Did 11450750 AES-128-GCM (256 bytes) seal operations in 1000014us (11450589.7 ops/sec): 2931.4 MB/s
Did 3830000 AES-128-GCM (1350 bytes) seal operations in 1000097us (3829628.5 ops/sec): 5170.0 MB/s
Did 790000 AES-128-GCM (8192 bytes) seal operations in 1000379us (789700.7 ops/sec): 6469.2 MB/s
Did 400000 AES-128-GCM (16384 bytes) seal operations in 1000980us (399608.4 ops/sec): 6547.2 MB/s
Did 16877000 AES-256-GCM (16 bytes) seal operations in 1000052us (16876122.4 ops/sec): 270.0 MB/s
Did 10438000 AES-256-GCM (256 bytes) seal operations in 1000067us (10437300.7 ops/sec): 2671.9 MB/s
Did 3419000 AES-256-GCM (1350 bytes) seal operations in 1000158us (3418459.9 ops/sec): 4614.9 MB/s
Did 698000 AES-256-GCM (8192 bytes) seal operations in 1000557us (697611.4 ops/sec): 5714.8 MB/s
Did 355000 AES-256-GCM (16384 bytes) seal operations in 1001900us (354326.8 ops/sec): 5805.3 MB/s

Change-Id: Id88f6e14482f09591fe95145bf4089de1ab68380
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/55926
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: David Benjamin <davidben@google.com>
diff --git a/LICENSE b/LICENSE
index 49c41fa..fc1cebb 100644
--- a/LICENSE
+++ b/LICENSE
@@ -21,6 +21,7 @@
   27287199
   27287880
   27287883
+  263291445
 
   OpenSSL License
   ---------------
diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt
index 35e4607..52b0942 100644
--- a/crypto/fipsmodule/CMakeLists.txt
+++ b/crypto/fipsmodule/CMakeLists.txt
@@ -61,6 +61,7 @@
     BCM_ASM_SOURCES
 
     aesv8-armx.${ASM_EXT}
+    aesv8-gcm-armv8.${ASM_EXT}
     armv8-mont.${ASM_EXT}
     ghash-neon-armv8.${ASM_EXT}
     ghashv8-armx.${ASM_EXT}
@@ -87,6 +88,7 @@
 perlasm(aesni-x86.${ASM_EXT} aes/asm/aesni-x86.pl)
 perlasm(aesp8-ppc.${ASM_EXT} aes/asm/aesp8-ppc.pl)
 perlasm(aesv8-armx.${ASM_EXT} aes/asm/aesv8-armx.pl)
+perlasm(aesv8-gcm-armv8.${ASM_EXT} modes/asm/aesv8-gcm-armv8.pl)
 perlasm(armv4-mont.${ASM_EXT} bn/asm/armv4-mont.pl)
 perlasm(armv8-mont.${ASM_EXT} bn/asm/armv8-mont.pl)
 perlasm(bn-586.${ASM_EXT} bn/asm/bn-586.pl)
diff --git a/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8.pl b/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8.pl
new file mode 100644
index 0000000..bf86174
--- /dev/null
+++ b/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8.pl
@@ -0,0 +1,1540 @@
+#! /usr/bin/env perl
+
+# Copyright (c) 2022, ARM Inc.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+#========================================================================
+# Written by Fangming Fang <fangming.fang@arm.com> for the OpenSSL project,
+# derived from https://github.com/ARM-software/AArch64cryptolib, original
+# author Samuel Lee <Samuel.Lee@arm.com>.
+#========================================================================
+#
+# Approach - assume we don't want to reload constants, so reserve ~half of the
+# vector register file for constants.
+#
+# The main loop acts on 4 16B blocks per iteration, and then does a modulo
+# reduction of the accumulated intermediate hashes from those 4 blocks
+# (sketched in loop form below the diagram).
+#
+#  ____________________________________________________
+# |                                                    |
+# | PRE                                                |
+# |____________________________________________________|
+# |                |                |                  |
+# | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 |
+# |________________|________________|__________________|
+# |                |                |                  |
+# | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 |
+# |________________|________________|__________________|
+# |                |                |                  |
+# | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 |
+# |________________|________________|__________________|
+# |                |                |                  |
+# | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 |
+# |________________|____(mostly)____|__________________|
+# |                                                    |
+# | MODULO                                             |
+# |____________________________________________________|
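+#
+# In other words, ignoring the instruction-level interleaving, the iteration
+# that encrypts AES blocks 4k+4..4k+7 roughly does (a C-like sketch of the
+# scheduling only; the helper names are illustrative):
+#
+#     ghash_accumulate(blocks 4k+0 .. 4k+3);   // results of the previous iteration
+#     aes_ctr(blocks 4k+4 .. 4k+7);            // counters made in the previous iteration
+#     prepare_ctr(blocks 4k+8 .. 4k+11);       // for the next iteration
+#     modulo_reduce(acc_h, acc_m, acc_l);
+#
+# so the GHASH work lags the AES work by one iteration (see the scheduling note
+# further down).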
+#
+# PRE: Ensure the previously generated intermediate hash is aligned and merged
+# with the result for GHASH block 4k+0
+#
+# EXT low_acc, low_acc, low_acc, #8
+# EOR res_curr (4k+0), res_curr (4k+0), low_acc
+#
+# CTR block: Increment and byte-reverse the counter in scalar registers and
+# transfer it to SIMD registers
+#
+# REV     ctr32, rev_ctr32
+# ORR     ctr64, constctr96_top32, ctr32, LSL #32
+# // Keeping this in scalar registers to free up space in SIMD RF
+# INS     ctr_next.d[0], constctr96_bottom64
+# INS     ctr_next.d[1], ctr64
+# ADD     rev_ctr32, #1
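+#
+# As a minimal C model of the counter layout assumed here (names are
+# illustrative; a little-endian host is assumed): the 16B CTR block is the
+# fixed 96b IV prefix followed by a 32b big-endian counter, and the counter is
+# kept in host order in rev_ctr32 and only byte-reversed when it is inserted:
+#
+#   #include <stdint.h>
+#   #include <string.h>
+#
+#   static void next_ctr_block(uint8_t block[16], const uint8_t iv96[12],
+#                              uint32_t *rev_ctr32) {
+#     uint32_t be = __builtin_bswap32(*rev_ctr32);  // REV ctr32, rev_ctr32
+#     memcpy(block, iv96, 12);                      // constctr96 bottom64 | top32
+#     memcpy(block + 12, &be, 4);                   // INS of the counter word
+#     (*rev_ctr32)++;                               // ADD rev_ctr32, #1
+#   }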
+#
+# AES block:
+#
+# Do AES encryption/decryption on CTR block X and EOR it with input block X.
+# Take the 256-bit key below as an example. We do a small trick here of loading
+# the input in scalar registers, EORing it with the last round key and then
+# transferring it to SIMD registers. Given we are very constrained in our ASIMD
+# registers, this is quite important.
+#
+#     Encrypt:
+# LDR     input_low, [ input_ptr  ], #8
+# LDR     input_high, [ input_ptr  ], #8
+# EOR     input_low, k14_low
+# EOR     input_high, k14_high
+# INS     res_curr.d[0], input_low
+# INS     res_curr.d[1], input_high
+# AESE    ctr_curr, k0; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k1; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k2; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k3; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k4; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k5; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k6; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k7; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k8; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k9; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k10; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k11; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k12; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k13
+# EOR     res_curr, res_curr, ctr_curr
+# ST1     { res_curr.16b  }, [ output_ptr  ], #16
+#
+#     Decrypt:
+# AESE    ctr_curr, k0; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k1; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k2; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k3; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k4; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k5; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k6; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k7; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k8; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k9; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k10; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k11; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k12; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k13
+# LDR     res_curr, [ input_ptr  ], #16
+# EOR     res_curr, res_curr, ctr_curr
+# MOV     output_low, res_curr.d[0]
+# MOV     output_high, res_curr.d[1]
+# EOR     output_low, k14_low
+# EOR     output_high, k14_high
+# STP     output_low, output_high, [ output_ptr  ], #16
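+#
+# Why the trick works (worked out for the AES-256 case above): the keystream
+# block is the state after the AESE k13 step, EORed with k14, so
+#
+#     ciphertext = (state after AESE k13) ^ k14 ^ plaintext
+#                = (state after AESE k13) ^ (plaintext ^ k14)
+#
+# which means the final AddRoundKey with k14 can be EORed into the plaintext
+# (or, when decrypting, into the output) in general-purpose registers, keeping
+# it off the busy ASIMD pipeline.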
+#
+# GHASH block X:
+#     Do a 128b Karatsuba polynomial multiplication on the block. We only have
+#     64b->128b polynomial multipliers, so naively that means we need to do 4
+#     64b multiplies to generate a 128b result.
+#
+# multiplication:
+#     Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^
+#                   (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
+#
+#     The idea behind Karatsuba multiplication is that we can do just 3 64b
+#     multiplies:
+#     Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^
+#                   (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^
+#                   Pmull(Al,Bl))<<64
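+#
+#     This works because PMULL distributes over XOR:
+#
+#     Pmull(Ah^Al, Bh^Bl) == Pmull(Ah,Bh) ^ Pmull(Ah,Bl) ^
+#                            Pmull(Al,Bh) ^ Pmull(Al,Bl)
+#
+#     so XORing Pmull(Ah,Bh) and Pmull(Al,Bl) back in leaves exactly the two
+#     cross terms Pmull(Ah,Bl) ^ Pmull(Al,Bh) required above.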
+#
+#     There is some complication here because the bit order of GHASH's PMULL is
+#     reversed compared to elsewhere, so we are multiplying with "twisted"
+#     powers of H
+#
+# Note: We can PMULL directly into the acc_x in the first GHASH of the loop
+#
+# Note: For scheduling on big cores we want to split the processing to happen
+#       over two loop iterations - otherwise the critical path latency
+#       dominates the performance.
+#
+#       This has a knock-on effect on register pressure, so we have to be a bit
+#       more clever with our temporary registers than indicated here.
+#
+# REV64   res_curr, res_curr
+# INS     t_m.d[0], res_curr.d[1]
+# EOR     t_m.8B, t_m.8B, res_curr.8B
+# PMULL2  t_h, res_curr, HX
+# PMULL   t_l, res_curr, HX
+# PMULL   t_m, t_m, HX_k
+# EOR     acc_h, acc_h, t_h
+# EOR     acc_l, acc_l, t_l
+# EOR     acc_m, acc_m, t_m
+#
+# MODULO: take the partial accumulators (~representing the sum of 256b
+#         multiplication results) from GHASH and do a modulo reduction on them.
+#         There is some complication here because the bit order of GHASH's
+#         PMULL is reversed compared to elsewhere, so we are doing the modulo
+#         with a reversed constant.
+#
+# EOR     acc_m, acc_m, acc_h
+# EOR     acc_m, acc_m, acc_l                // Finish off karatsuba processing
+# PMULL   t_mod, acc_h, mod_constant
+# EXT     acc_h, acc_h, acc_h, #8
+# EOR     acc_m, acc_m, acc_h
+# EOR     acc_m, acc_m, t_mod
+# PMULL   acc_h, acc_m, mod_constant
+# EXT     acc_m, acc_m, acc_m, #8
+# EOR     acc_l, acc_l, acc_h
+# EOR     acc_l, acc_l, acc_m
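+#
+# (The modulus here is the GHASH polynomial x^128 + x^7 + x^2 + x + 1; the
+# reversed constant mentioned above is the 0xc2 pattern that the code below
+# builds with MOVI #0xc2 / SHL #56.)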
+#
+# This code was then modified to merge the AES-128-GCM, AES-192-GCM, and
+# AES-256-GCM implementations into a single function to reduce size. We moved
+# the last two round keys into consistent registers across all sizes, as they
+# are treated specially. Then, after rounds 0 through 8, we added some branches
+# to conditionally run rounds 9-10 (AES-192 and AES-256) and 11-12 (AES-256
+# only), before merging back into code which finishes up the last two rounds.
+# A C-like sketch of this dispatch is given below.
+#
+# There is a somewhat arbitrary decision to be made around how much parallel
+# work goes before or after the conditional part. We attempted to preserve the
+# original scheduling where possible, but it's possible other schedulings are
+# better given the current ordering.
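+#
+# Roughly, in C-like pseudocode (the helper names are illustrative only), the
+# merged per-block round structure is:
+#
+#   rounds = key->rounds;                // 10, 12 or 14, read from [key, #240]
+#   for (i = 0; i <= 8; i++)
+#     block = aese_aesmc(block, rk[i]);  // shared by all key sizes
+#   if (rounds > 10) {                   // AES-192 and AES-256
+#     block = aese_aesmc(block, rk[9]);
+#     block = aese_aesmc(block, rk[10]);
+#   }
+#   if (rounds > 12) {                   // AES-256 only
+#     block = aese_aesmc(block, rk[11]);
+#     block = aese_aesmc(block, rk[12]);
+#   }
+#   block = aese(block, rkNm1);          // round N-1, no MixColumns
+#   output = block ^ rkN ^ input;        // round N folded into the data EOR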
+
+$flavour = shift;
+$output  = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$code=<<___;
+#include <openssl/arm_arch.h>
+#if __ARM_MAX_ARCH__ >= 8
+
+.arch armv8-a+crypto
+.text
+___
+
+$input_ptr="x0";  #argument block
+$bit_length="x1";
+$output_ptr="x2";
+$current_tag="x3";
+$counter="x16";
+$cc="x8";
+
+{
+my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
+my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
+my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
+my ($output_l0,$output_h0)=map("x$_",(6..7));
+
+# rkN_l and rkN_h store the final round key, which is handled slightly
+# differently because it is EORed through general-purpose registers.
+my $ctr32w="w9";
+my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rkN_l,$rkN_h,$len)=map("x$_",(9..15));
+my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
+
+my $rounds="x17";
+my $roundsw="w17";
+
+my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
+my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
+my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
+my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
+
+my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
+my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
+my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
+
+my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
+my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
+my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
+
+my $t0="v8";
+my $t0d="d8";
+my $t1="v4";
+my $t1d="d4";
+my $t2="v8";
+my $t2d="d8";
+my $t3="v4";
+my $t3d="d4";
+my $t4="v4";
+my $t4d="d4";
+my $t5="v5";
+my $t5d="d5";
+my $t6="v8";
+my $t6d="d8";
+my $t7="v5";
+my $t7d="d5";
+my $t8="v6";
+my $t8d="d6";
+my $t9="v4";
+my $t9d="d4";
+
+my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
+my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
+my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
+
+my $mod_constantd="d8";
+my $mod_constant="v8";
+my $mod_t="v7";
+
+# rkNm1 stores the second-to-last round key, which is handled slightly
+# differently because it uses plain AESE instead of an AESE + AESMC macro-op.
+my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rkNm1)=map("v$_.16b",(18..31));
+my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rkNm1q)=map("q$_",(18..31));
+my $rk2q1="v20.1q";
+my $rk3q1="v21.1q";
+my $rk4v="v22";
+my $rk4d="d22";
+
+################################################################################
+# size_t aes_gcm_enc_kernel(const uint8_t *in,
+#                           size_t len_bits,
+#                           uint8_t *out,
+#                           u64 *Xi,
+#                           uint8_t ivec[16],
+#                           const void *key);
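+#
+# (Parameter sketch, judging from the loads below: in/out are len_bits/8 bytes
+# of plaintext/ciphertext; Xi points at the current 16-byte GHASH tag, with the
+# precomputed powers of H read from fixed offsets after it; ivec is the 16-byte
+# counter block, a 96-bit IV followed by a 32-bit big-endian counter; key is an
+# AES key schedule whose round count is read from byte offset 240.)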
+#
+$code.=<<___;
+.global aes_gcm_enc_kernel
+.type   aes_gcm_enc_kernel,%function
+.align  4
+aes_gcm_enc_kernel:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29, x30, [sp, #-128]!
+	mov	x29, sp
+	stp     x19, x20, [sp, #16]
+	mov     $counter, x4
+	mov     $cc, x5
+	stp     x21, x22, [sp, #32]
+	stp     x23, x24, [sp, #48]
+	stp     d8, d9, [sp, #64]
+	stp     d10, d11, [sp, #80]
+	stp     d12, d13, [sp, #96]
+	stp     d14, d15, [sp, #112]
+	ldr	$roundsw, [$cc, #240]
+	add	$input_l1, $cc, $rounds, lsl #4                   // borrow input_l1 for last key
+	ldp     $rkN_l, $rkN_h, [$input_l1]                       // load round N keys
+	ldr     $rkNm1q, [$input_l1, #-16]                        // load round N-1 keys
+	add     $end_input_ptr, $input_ptr, $bit_length, lsr #3   // end_input_ptr
+	lsr     $main_end_input_ptr, $bit_length, #3              // byte_len
+	mov     $len, $main_end_input_ptr
+	ldp     $ctr96_b64x, $ctr96_t32x, [$counter]              // ctr96_b64, ctr96_t32
+	ld1     { $ctr0b}, [$counter]                             // special case vector load initial counter so we can start first AES block as quickly as possible
+	sub     $main_end_input_ptr, $main_end_input_ptr, #1      // byte_len - 1
+	ldr     $rk0q, [$cc, #0]                                  // load rk0
+	and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+	ldr     $rk7q, [$cc, #112]                                // load rk7
+	add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
+	lsr     $rctr32x, $ctr96_t32x, #32
+	fmov    $ctr2d, $ctr96_b64x                               // CTR block 2
+	orr     $ctr96_t32w, $ctr96_t32w, $ctr96_t32w             // zero-extend ctr96_t32x (a W-register write clears the upper 32 bits)
+	rev     $rctr32w, $rctr32w                                // rev_ctr32
+	fmov    $ctr1d, $ctr96_b64x                               // CTR block 1
+	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 0
+	add     $rctr32w, $rctr32w, #1                            // increment rev_ctr32
+	rev     $ctr32w, $rctr32w                                 // CTR block 1
+	fmov    $ctr3d, $ctr96_b64x                               // CTR block 3
+	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 1
+	add     $rctr32w, $rctr32w, #1                            // CTR block 1
+	ldr     $rk1q, [$cc, #16]                                 // load rk1
+	fmov    $ctr1.d[1], $ctr32x                               // CTR block 1
+	rev     $ctr32w, $rctr32w                                 // CTR block 2
+	add     $rctr32w, $rctr32w, #1                            // CTR block 2
+	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 2
+	ldr     $rk2q, [$cc, #32]                                 // load rk2
+	fmov    $ctr2.d[1], $ctr32x                               // CTR block 2
+	rev     $ctr32w, $rctr32w                                 // CTR block 3
+	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 1
+	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 3
+	fmov    $ctr3.d[1], $ctr32x                               // CTR block 3
+	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 0
+	ldr     $rk3q, [$cc, #48]                                 // load rk3
+	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 2
+	ldr     $rk6q, [$cc, #96]                                 // load rk6
+	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 0
+	ldr     $rk5q, [$cc, #80]                                 // load rk5
+	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 1
+	ldr     $h3q, [$current_tag, #80]                         // load h3l | h3h
+	ext     $h3b, $h3b, $h3b, #8
+	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 0
+	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 1
+	ldr     $rk4q, [$cc, #64]                                 // load rk4
+	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 2
+	ldr     $h2q, [$current_tag, #64]                         // load h2l | h2h
+	ext     $h2b, $h2b, $h2b, #8
+	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 1
+	ldr     $rk12q, [$cc, #192]                               // load rk12
+	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 2
+	ldr     $h4q, [$current_tag, #112]                        // load h4l | h4h
+	ext     $h4b, $h4b, $h4b, #8
+	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 3
+	ldr     $rk11q, [$cc, #176]                               // load rk11
+	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 2
+	ldr     $rk8q, [$cc, #128]                                // load rk8
+	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 3
+	add     $rctr32w, $rctr32w, #1                            // CTR block 3
+	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 3
+	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 3
+	ld1     { $acc_lb}, [$current_tag]
+	ext     $acc_lb, $acc_lb, $acc_lb, #8
+	rev64   $acc_lb, $acc_lb
+	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 4
+	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 4
+	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 4
+	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 4
+	cmp     $rounds, #12                                      // setup flags for AES-128/192/256 check
+	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 5
+	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 5
+	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 5
+	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 5
+	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 6
+	trn2    $h34k.2d,  $h3.2d,    $h4.2d                      // h4l | h3l
+	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 6
+	ldr     $rk9q, [$cc, #144]                                // load rk9
+	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 6
+	ldr     $h1q, [$current_tag, #32]                         // load h1l | h1h
+	ext     $h1b, $h1b, $h1b, #8
+	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 6
+	ldr     $rk10q, [$cc, #160]                               // load rk10
+	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 7
+	trn1    $acc_h.2d, $h3.2d,    $h4.2d                      // h4h | h3h
+	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 7
+	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 7
+	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 7
+	trn2    $h12k.2d,  $h1.2d,    $h2.2d                      // h2l | h1l
+	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 8
+	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 8
+	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 8
+	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 8
+	b.lt	.Lenc_finish_first_blocks                         // branch if AES-128
+
+	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 9
+	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 9
+	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 9
+	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 9
+	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 10
+	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 10
+	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 10
+	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 10
+	b.eq	.Lenc_finish_first_blocks                         // branch if AES-192
+
+	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 11
+	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 11
+	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 11
+	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 11
+	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 12
+	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 12
+	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 12
+	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 12
+
+.Lenc_finish_first_blocks:
+	cmp     $input_ptr, $main_end_input_ptr                   // check if we have <= 4 blocks
+	eor     $h34k.16b, $h34k.16b, $acc_h.16b                  // h4k | h3k
+	aese    $ctr2b, $rkNm1                                    // AES block 2 - round N-1
+	trn1    $t0.2d,    $h1.2d,    $h2.2d                      // h2h | h1h
+	aese    $ctr1b, $rkNm1                                    // AES block 1 - round N-1
+	aese    $ctr0b, $rkNm1                                    // AES block 0 - round N-1
+	aese    $ctr3b, $rkNm1                                    // AES block 3 - round N-1
+	eor     $h12k.16b, $h12k.16b, $t0.16b                     // h2k | h1k
+	b.ge    .Lenc_tail                                        // handle tail
+
+	ldp     $input_l1, $input_h1, [$input_ptr, #16]           // AES block 1 - load plaintext
+	rev     $ctr32w, $rctr32w                                 // CTR block 4
+	ldp     $input_l0, $input_h0, [$input_ptr, #0]            // AES block 0 - load plaintext
+	ldp     $input_l3, $input_h3, [$input_ptr, #48]           // AES block 3 - load plaintext
+	ldp     $input_l2, $input_h2, [$input_ptr, #32]           // AES block 2 - load plaintext
+	add     $input_ptr, $input_ptr, #64                       // AES input_ptr update
+	eor     $input_l1, $input_l1, $rkN_l                      // AES block 1 - round N low
+	eor     $input_h1, $input_h1, $rkN_h                      // AES block 1 - round N high
+	fmov    $ctr_t1d, $input_l1                               // AES block 1 - mov low
+	eor     $input_l0, $input_l0, $rkN_l                      // AES block 0 - round N low
+	eor     $input_h0, $input_h0, $rkN_h                      // AES block 0 - round N high
+	eor     $input_h3, $input_h3, $rkN_h                      // AES block 3 - round N high
+	fmov    $ctr_t0d, $input_l0                               // AES block 0 - mov low
+	cmp     $input_ptr, $main_end_input_ptr                   // check if we have <= 8 blocks
+	fmov    $ctr_t0.d[1], $input_h0                           // AES block 0 - mov high
+	eor     $input_l3, $input_l3, $rkN_l                      // AES block 3 - round N low
+	eor     $input_l2, $input_l2, $rkN_l                      // AES block 2 - round N low
+	fmov    $ctr_t1.d[1], $input_h1                           // AES block 1 - mov high
+	fmov    $ctr_t2d, $input_l2                               // AES block 2 - mov low
+	add     $rctr32w, $rctr32w, #1                            // CTR block 4
+	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 4
+	fmov    $ctr_t3d, $input_l3                               // AES block 3 - mov low
+	eor     $input_h2, $input_h2, $rkN_h                      // AES block 2 - round N high
+	fmov    $ctr_t2.d[1], $input_h2                           // AES block 2 - mov high
+	eor     $res0b, $ctr_t0b, $ctr0b                          // AES block 0 - result
+	fmov    $ctr0d, $ctr96_b64x                               // CTR block 4
+	fmov    $ctr0.d[1], $ctr32x                               // CTR block 4
+	rev     $ctr32w, $rctr32w                                 // CTR block 5
+	add     $rctr32w, $rctr32w, #1                            // CTR block 5
+	eor     $res1b, $ctr_t1b, $ctr1b                          // AES block 1 - result
+	fmov    $ctr1d, $ctr96_b64x                               // CTR block 5
+	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 5
+	fmov    $ctr1.d[1], $ctr32x                               // CTR block 5
+	rev     $ctr32w, $rctr32w                                 // CTR block 6
+	st1     { $res0b}, [$output_ptr], #16                     // AES block 0 - store result
+	fmov    $ctr_t3.d[1], $input_h3                           // AES block 3 - mov high
+	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 6
+	eor     $res2b, $ctr_t2b, $ctr2b                          // AES block 2 - result
+	st1     { $res1b}, [$output_ptr], #16                     // AES block 1 - store result
+	add     $rctr32w, $rctr32w, #1                            // CTR block 6
+	fmov    $ctr2d, $ctr96_b64x                               // CTR block 6
+	fmov    $ctr2.d[1], $ctr32x                               // CTR block 6
+	st1     { $res2b}, [$output_ptr], #16                     // AES block 2 - store result
+	rev     $ctr32w, $rctr32w                                 // CTR block 7
+	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 7
+	eor     $res3b, $ctr_t3b, $ctr3b                          // AES block 3 - result
+	st1     { $res3b}, [$output_ptr], #16                     // AES block 3 - store result
+	b.ge    .Lenc_prepretail                                  // do prepretail
+
+.Lenc_main_loop:                                                  // main loop start
+	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 0
+	rev64   $res0b, $res0b                                    // GHASH block 4k (only t0 is free)
+	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 0
+	fmov    $ctr3d, $ctr96_b64x                               // CTR block 4k+3
+	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 0
+	ext     $acc_lb, $acc_lb, $acc_lb, #8                     // PRE 0
+	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 1
+	fmov    $ctr3.d[1], $ctr32x                               // CTR block 4k+3
+	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 1
+	ldp     $input_l3, $input_h3, [$input_ptr, #48]           // AES block 4k+7 - load plaintext
+	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 1
+	ldp     $input_l2, $input_h2, [$input_ptr, #32]           // AES block 4k+6 - load plaintext
+	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 2
+	eor     $res0b, $res0b, $acc_lb                           // PRE 1
+	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 2
+	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 0
+	eor     $input_l3, $input_l3, $rkN_l                      // AES block 4k+7 - round N low
+	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 3
+	mov     $acc_md, $h34k.d[1]                               // GHASH block 4k - mid
+	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       // GHASH block 4k - high
+	eor     $input_h2, $input_h2, $rkN_h                      // AES block 4k+6 - round N high
+	mov     $t0d, $res0.d[1]                                  // GHASH block 4k - mid
+	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 1
+	rev64   $res1b, $res1b                                    // GHASH block 4k+1 (t0 and t1 free)
+	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 4
+	pmull   $acc_l.1q, $res0.1d, $h4.1d                       // GHASH block 4k - low
+	eor     $t0.8b, $t0.8b, $res0.8b                          // GHASH block 4k - mid
+	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 2
+	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 5
+	rev64   $res3b, $res3b                                    // GHASH block 4k+3 (t0, t1, t2 and t3 free)
+	pmull2  $t1.1q, $res1.2d, $h3.2d                          // GHASH block 4k+1 - high
+	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      // GHASH block 4k - mid
+	rev64   $res2b, $res2b                                    // GHASH block 4k+2 (t0, t1, and t2 free)
+	pmull   $t2.1q, $res1.1d, $h3.1d                          // GHASH block 4k+1 - low
+	eor     $acc_hb, $acc_hb, $t1.16b                         // GHASH block 4k+1 - high
+	mov     $t3d, $res1.d[1]                                  // GHASH block 4k+1 - mid
+	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 3
+	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 2
+	eor     $acc_lb, $acc_lb, $t2.16b                         // GHASH block 4k+1 - low
+	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 3
+	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 4
+	mov     $t6d, $res2.d[1]                                  // GHASH block 4k+2 - mid
+	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 3
+	eor     $t3.8b, $t3.8b, $res1.8b                          // GHASH block 4k+1 - mid
+	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 4
+	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 6
+	eor     $t6.8b, $t6.8b, $res2.8b                          // GHASH block 4k+2 - mid
+	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 4
+	pmull   $t3.1q, $t3.1d, $h34k.1d                          // GHASH block 4k+1 - mid
+	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 7
+	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 5
+	ins     $t6.d[1], $t6.d[0]                                // GHASH block 4k+2 - mid
+	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 5
+	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 8
+	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 5
+	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 6
+	eor     $acc_mb, $acc_mb, $t3.16b                         // GHASH block 4k+1 - mid
+	pmull2  $t4.1q, $res2.2d, $h2.2d                          // GHASH block 4k+2 - high
+	pmull   $t5.1q, $res2.1d, $h2.1d                          // GHASH block 4k+2 - low
+	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 7
+	pmull   $t8.1q, $res3.1d, $h1.1d                          // GHASH block 4k+3 - low
+	eor     $acc_hb, $acc_hb, $t4.16b                         // GHASH block 4k+2 - high
+	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 6
+	ldp     $input_l1, $input_h1, [$input_ptr, #16]           // AES block 4k+5 - load plaintext
+	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 8
+	mov     $t9d, $res3.d[1]                                  // GHASH block 4k+3 - mid
+	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 6
+	eor     $acc_lb, $acc_lb, $t5.16b                         // GHASH block 4k+2 - low
+	pmull2  $t6.1q, $t6.2d, $h12k.2d                          // GHASH block 4k+2 - mid
+	pmull2  $t7.1q, $res3.2d, $h1.2d                          // GHASH block 4k+3 - high
+	eor     $t9.8b, $t9.8b, $res3.8b                          // GHASH block 4k+3 - mid
+	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 7
+	eor     $input_l1, $input_l1, $rkN_l                      // AES block 4k+5 - round N low
+	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 8
+	eor     $acc_mb, $acc_mb, $t6.16b                         // GHASH block 4k+2 - mid
+	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 7
+	eor     $input_l2, $input_l2, $rkN_l                      // AES block 4k+6 - round N low
+	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 8
+	movi    $mod_constant.8b, #0xc2
+	pmull   $t9.1q, $t9.1d, $h12k.1d                          // GHASH block 4k+3 - mid
+	eor     $acc_hb, $acc_hb, $t7.16b                         // GHASH block 4k+3 - high
+	cmp     $rounds, #12                                      // setup flags for AES-128/192/256 check
+	fmov    $ctr_t1d, $input_l1                               // AES block 4k+5 - mov low
+	ldp     $input_l0, $input_h0, [$input_ptr, #0]            // AES block 4k+4 - load plaintext
+	b.lt	.Lenc_main_loop_continue                          // branch if AES-128
+
+	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 9
+	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 9
+	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 9
+	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 9
+	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 10
+	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 10
+	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 10
+	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 10
+	b.eq	.Lenc_main_loop_continue                          // branch if AES-192
+
+	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 11
+	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 11
+	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 11
+	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 11
+	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 12
+	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 12
+	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 12
+	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 12
+
+.Lenc_main_loop_continue:
+	shl     $mod_constantd, $mod_constantd, #56               // mod_constant
+	eor     $acc_lb, $acc_lb, $t8.16b                         // GHASH block 4k+3 - low
+	eor     $acc_mb, $acc_mb, $t9.16b                         // GHASH block 4k+3 - mid
+	add     $rctr32w, $rctr32w, #1                            // CTR block 4k+3
+	eor     $t9.16b, $acc_lb, $acc_hb                         // MODULO - karatsuba tidy up
+	add     $input_ptr, $input_ptr, #64                       // AES input_ptr update
+	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            // MODULO - top 64b align with mid
+	rev     $ctr32w, $rctr32w                                 // CTR block 4k+8
+	ext     $acc_hb, $acc_hb, $acc_hb, #8                     // MODULO - other top alignment
+	eor     $input_l0, $input_l0, $rkN_l                      // AES block 4k+4 - round N low
+	eor     $acc_mb, $acc_mb, $t9.16b                         // MODULO - karatsuba tidy up
+	eor     $input_h0, $input_h0, $rkN_h                      // AES block 4k+4 - round N high
+	fmov    $ctr_t0d, $input_l0                               // AES block 4k+4 - mov low
+	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 4k+8
+	eor     $mod_t.16b, $acc_hb, $mod_t.16b                   // MODULO - fold into mid
+	eor     $input_h1, $input_h1, $rkN_h                      // AES block 4k+5 - round N high
+	eor     $input_h3, $input_h3, $rkN_h                      // AES block 4k+7 - round N high
+	add     $rctr32w, $rctr32w, #1                            // CTR block 4k+8
+	aese    $ctr0b, $rkNm1                                    // AES block 4k+4 - round N-1
+	fmov    $ctr_t0.d[1], $input_h0                           // AES block 4k+4 - mov high
+	eor     $acc_mb, $acc_mb, $mod_t.16b                      // MODULO - fold into mid
+	fmov    $ctr_t3d, $input_l3                               // AES block 4k+7 - mov low
+	aese    $ctr1b, $rkNm1                                    // AES block 4k+5 - round N-1
+	fmov    $ctr_t1.d[1], $input_h1                           // AES block 4k+5 - mov high
+	fmov    $ctr_t2d, $input_l2                               // AES block 4k+6 - mov low
+	cmp     $input_ptr, $main_end_input_ptr                   // LOOP CONTROL
+	fmov    $ctr_t2.d[1], $input_h2                           // AES block 4k+6 - mov high
+	pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d            // MODULO - mid 64b align with low
+	eor     $res0b, $ctr_t0b, $ctr0b                          // AES block 4k+4 - result
+	fmov    $ctr0d, $ctr96_b64x                               // CTR block 4k+8
+	fmov    $ctr0.d[1], $ctr32x                               // CTR block 4k+8
+	rev     $ctr32w, $rctr32w                                 // CTR block 4k+9
+	add     $rctr32w, $rctr32w, #1                            // CTR block 4k+9
+	eor     $res1b, $ctr_t1b, $ctr1b                          // AES block 4k+5 - result
+	fmov    $ctr1d, $ctr96_b64x                               // CTR block 4k+9
+	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 4k+9
+	fmov    $ctr1.d[1], $ctr32x                               // CTR block 4k+9
+	aese    $ctr2b, $rkNm1                                    // AES block 4k+6 - round N-1
+	rev     $ctr32w, $rctr32w                                 // CTR block 4k+10
+	st1     { $res0b}, [$output_ptr], #16                     // AES block 4k+4 - store result
+	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 4k+10
+	eor     $acc_lb, $acc_lb, $acc_hb                         // MODULO - fold into low
+	fmov    $ctr_t3.d[1], $input_h3                           // AES block 4k+7 - mov high
+	ext     $acc_mb, $acc_mb, $acc_mb, #8                     // MODULO - other mid alignment
+	st1     { $res1b}, [$output_ptr], #16                     // AES block 4k+5 - store result
+	add     $rctr32w, $rctr32w, #1                            // CTR block 4k+10
+	aese    $ctr3b, $rkNm1                                    // AES block 4k+7 - round N-1
+	eor     $res2b, $ctr_t2b, $ctr2b                          // AES block 4k+6 - result
+	fmov    $ctr2d, $ctr96_b64x                               // CTR block 4k+10
+	st1     { $res2b}, [$output_ptr], #16                     // AES block 4k+6 - store result
+	fmov    $ctr2.d[1], $ctr32x                               // CTR block 4k+10
+	rev     $ctr32w, $rctr32w                                 // CTR block 4k+11
+	eor     $acc_lb, $acc_lb, $acc_mb                         // MODULO - fold into low
+	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 4k+11
+	eor     $res3b, $ctr_t3b, $ctr3b                          // AES block 4k+7 - result
+	st1     { $res3b}, [$output_ptr], #16                     // AES block 4k+7 - store result
+	b.lt    .Lenc_main_loop
+
+.Lenc_prepretail:                                                 // PREPRETAIL
+	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 0
+	rev64   $res2b, $res2b                                    // GHASH block 4k+2 (t0, t1, and t2 free)
+	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 0
+	fmov    $ctr3d, $ctr96_b64x                               // CTR block 4k+3
+	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 0
+	rev64   $res0b, $res0b                                    // GHASH block 4k (only t0 is free)
+	fmov    $ctr3.d[1], $ctr32x                               // CTR block 4k+3
+	ext     $acc_lb, $acc_lb, $acc_lb, #8                     // PRE 0
+	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 1
+	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 1
+	eor     $res0b, $res0b, $acc_lb                           // PRE 1
+	rev64   $res1b, $res1b                                    // GHASH block 4k+1 (t0 and t1 free)
+	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 2
+	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 0
+	mov     $acc_md, $h34k.d[1]                               // GHASH block 4k - mid
+	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 1
+	pmull   $acc_l.1q, $res0.1d, $h4.1d                       // GHASH block 4k - low
+	mov     $t0d, $res0.d[1]                                  // GHASH block 4k - mid
+	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       // GHASH block 4k - high
+	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 3
+	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 2
+	eor     $t0.8b, $t0.8b, $res0.8b                          // GHASH block 4k - mid
+	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 2
+	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 1
+	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 3
+	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      // GHASH block 4k - mid
+	pmull2  $t1.1q, $res1.2d, $h3.2d                          // GHASH block 4k+1 - high
+	pmull   $t2.1q, $res1.1d, $h3.1d                          // GHASH block 4k+1 - low
+	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 2
+	eor     $acc_hb, $acc_hb, $t1.16b                         // GHASH block 4k+1 - high
+	mov     $t3d, $res1.d[1]                                  // GHASH block 4k+1 - mid
+	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 3
+	eor     $acc_lb, $acc_lb, $t2.16b                         // GHASH block 4k+1 - low
+	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 3
+	eor     $t3.8b, $t3.8b, $res1.8b                          // GHASH block 4k+1 - mid
+	mov     $t6d, $res2.d[1]                                  // GHASH block 4k+2 - mid
+	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 4
+	rev64   $res3b, $res3b                                    // GHASH block 4k+3 (t0, t1, t2 and t3 free)
+	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 4
+	pmull   $t3.1q, $t3.1d, $h34k.1d                          // GHASH block 4k+1 - mid
+	eor     $t6.8b, $t6.8b, $res2.8b                          // GHASH block 4k+2 - mid
+	add     $rctr32w, $rctr32w, #1                            // CTR block 4k+3
+	pmull   $t5.1q, $res2.1d, $h2.1d                          // GHASH block 4k+2 - low
+	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 5
+	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 4
+	eor     $acc_mb, $acc_mb, $t3.16b                         // GHASH block 4k+1 - mid
+	pmull2  $t4.1q, $res2.2d, $h2.2d                          // GHASH block 4k+2 - high
+	eor     $acc_lb, $acc_lb, $t5.16b                         // GHASH block 4k+2 - low
+	ins     $t6.d[1], $t6.d[0]                                // GHASH block 4k+2 - mid
+	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 5
+	eor     $acc_hb, $acc_hb, $t4.16b                         // GHASH block 4k+2 - high
+	mov     $t9d, $res3.d[1]                                  // GHASH block 4k+3 - mid
+	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 4
+	pmull2  $t6.1q, $t6.2d, $h12k.2d                          // GHASH block 4k+2 - mid
+	eor     $t9.8b, $t9.8b, $res3.8b                          // GHASH block 4k+3 - mid
+	pmull2  $t7.1q, $res3.2d, $h1.2d                          // GHASH block 4k+3 - high
+	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 5
+	pmull   $t9.1q, $t9.1d, $h12k.1d                          // GHASH block 4k+3 - mid
+	eor     $acc_mb, $acc_mb, $t6.16b                         // GHASH block 4k+2 - mid
+	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 5
+	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 6
+	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 6
+	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 6
+	movi    $mod_constant.8b, #0xc2
+	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 6
+	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 7
+	eor     $acc_hb, $acc_hb, $t7.16b                         // GHASH block 4k+3 - high
+	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 7
+	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 7
+	shl     $mod_constantd, $mod_constantd, #56               // mod_constant
+	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 8
+	eor     $acc_mb, $acc_mb, $t9.16b                         // GHASH block 4k+3 - mid
+	pmull   $t8.1q, $res3.1d, $h1.1d                          // GHASH block 4k+3 - low
+	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 8
+	cmp     $rounds, #12                                      // setup flags for AES-128/192/256 check
+	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 8
+	eor     $acc_lb, $acc_lb, $t8.16b                         // GHASH block 4k+3 - low
+	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 7
+	eor     $acc_mb, $acc_mb, $acc_hb                         // karatsuba tidy up
+	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 8
+	pmull   $t1.1q, $acc_h.1d, $mod_constant.1d
+	ext     $acc_hb, $acc_hb, $acc_hb, #8
+	eor     $acc_mb, $acc_mb, $acc_lb
+	b.lt	.Lenc_finish_prepretail                           // branch if AES-128
+
+	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 9
+	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 9
+	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 9
+	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 9
+	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 10
+	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 10
+	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 10
+	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 10
+	b.eq	.Lenc_finish_prepretail                           // branch if AES-192
+
+	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 11
+	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 11
+	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 11
+	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 11
+	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 12
+	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 12
+	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 12
+	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 12
+
+.Lenc_finish_prepretail:
+	eor     $acc_mb, $acc_mb, $t1.16b
+	eor     $acc_mb, $acc_mb, $acc_hb
+	pmull   $t1.1q, $acc_m.1d, $mod_constant.1d
+	ext     $acc_mb, $acc_mb, $acc_mb, #8
+	aese    $ctr1b, $rkNm1                                    // AES block 4k+5 - round N-1
+	eor     $acc_lb, $acc_lb, $t1.16b
+	aese    $ctr3b, $rkNm1                                    // AES block 4k+7 - round N-1
+	aese    $ctr0b, $rkNm1                                    // AES block 4k+4 - round N-1
+	aese    $ctr2b, $rkNm1                                    // AES block 4k+6 - round N-1
+	eor     $acc_lb, $acc_lb, $acc_mb
+
+.Lenc_tail:                                                       // TAIL
+	ext     $t0.16b, $acc_lb, $acc_lb, #8                     // prepare final partial tag
+	sub     $main_end_input_ptr, $end_input_ptr, $input_ptr   // main_end_input_ptr is number of bytes left to process
+	ldp     $input_l0, $input_h0, [$input_ptr], #16           // AES block 4k+4 - load plaintext
+	eor     $input_l0, $input_l0, $rkN_l                      // AES block 4k+4 - round N low
+	eor     $input_h0, $input_h0, $rkN_h                      // AES block 4k+4 - round N high
+	cmp     $main_end_input_ptr, #48
+	fmov    $ctr_t0d, $input_l0                               // AES block 4k+4 - mov low
+	fmov    $ctr_t0.d[1], $input_h0                           // AES block 4k+4 - mov high
+	eor     $res1b, $ctr_t0b, $ctr0b                          // AES block 4k+4 - result
+	b.gt    .Lenc_blocks_more_than_3
+	cmp     $main_end_input_ptr, #32
+	mov     $ctr3b, $ctr2b
+	movi    $acc_l.8b, #0
+	movi    $acc_h.8b, #0
+	sub     $rctr32w, $rctr32w, #1
+	mov     $ctr2b, $ctr1b
+	movi    $acc_m.8b, #0
+	b.gt    .Lenc_blocks_more_than_2
+	mov     $ctr3b, $ctr1b
+	sub     $rctr32w, $rctr32w, #1
+	cmp     $main_end_input_ptr, #16
+	b.gt    .Lenc_blocks_more_than_1
+	sub     $rctr32w, $rctr32w, #1
+	b       .Lenc_blocks_less_than_1
+.Lenc_blocks_more_than_3:                                        // blocks left >  3
+	st1     { $res1b}, [$output_ptr], #16                    // AES final-3 block  - store result
+	ldp     $input_l0, $input_h0, [$input_ptr], #16          // AES final-2 block - load input low & high
+	rev64   $res0b, $res1b                                   // GHASH final-3 block
+	eor     $input_l0, $input_l0, $rkN_l                     // AES final-2 block - round N low
+	eor     $res0b, $res0b, $t0.16b                          // feed in partial tag
+	eor     $input_h0, $input_h0, $rkN_h                     // AES final-2 block - round N high
+	mov     $rk4d, $res0.d[1]                                // GHASH final-3 block - mid
+	fmov    $res1d, $input_l0                                // AES final-2 block - mov low
+	fmov    $res1.d[1], $input_h0                            // AES final-2 block - mov high
+	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     // GHASH final-3 block - mid
+	movi    $t0.8b, #0                                       // suppress further partial tag feed in
+	mov     $acc_md, $h34k.d[1]                              // GHASH final-3 block - mid
+	pmull   $acc_l.1q, $res0.1d, $h4.1d                      // GHASH final-3 block - low
+	pmull2  $acc_h.1q, $res0.2d, $h4.2d                      // GHASH final-3 block - high
+	pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                   // GHASH final-3 block - mid
+	eor     $res1b, $res1b, $ctr1b                           // AES final-2 block - result
+.Lenc_blocks_more_than_2:                                        // blocks left >  2
+	st1     { $res1b}, [$output_ptr], #16                    // AES final-2 block - store result
+	ldp     $input_l0, $input_h0, [$input_ptr], #16          // AES final-1 block - load input low & high
+	rev64   $res0b, $res1b                                   // GHASH final-2 block
+	eor     $input_l0, $input_l0, $rkN_l                     // AES final-1 block - round N low
+	eor     $res0b, $res0b, $t0.16b                          // feed in partial tag
+	fmov    $res1d, $input_l0                                // AES final-1 block - mov low
+	eor     $input_h0, $input_h0, $rkN_h                     // AES final-1 block - round N high
+	fmov    $res1.d[1], $input_h0                            // AES final-1 block - mov high
+	movi    $t0.8b, #0                                       // suppress further partial tag feed in
+	pmull2  $rk2q1, $res0.2d, $h3.2d                         // GHASH final-2 block - high
+	mov     $rk4d, $res0.d[1]                                // GHASH final-2 block - mid
+	pmull   $rk3q1, $res0.1d, $h3.1d                         // GHASH final-2 block - low
+	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     // GHASH final-2 block - mid
+	eor     $res1b, $res1b, $ctr2b                           // AES final-1 block - result
+	eor     $acc_hb, $acc_hb, $rk2                           // GHASH final-2 block - high
+	pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                     // GHASH final-2 block - mid
+	eor     $acc_lb, $acc_lb, $rk3                           // GHASH final-2 block - low
+	eor     $acc_mb, $acc_mb, $rk4v.16b                      // GHASH final-2 block - mid
+.Lenc_blocks_more_than_1:                                        // blocks left >  1
+	st1     { $res1b}, [$output_ptr], #16                    // AES final-1 block - store result
+	rev64   $res0b, $res1b                                   // GHASH final-1 block
+	ldp     $input_l0, $input_h0, [$input_ptr], #16          // AES final block - load input low & high
+	eor     $res0b, $res0b, $t0.16b                          // feed in partial tag
+	movi    $t0.8b, #0                                       // suppress further partial tag feed in
+	eor     $input_l0, $input_l0, $rkN_l                     // AES final block - round N low
+	mov     $rk4d, $res0.d[1]                                // GHASH final-1 block - mid
+	pmull2  $rk2q1, $res0.2d, $h2.2d                         // GHASH final-1 block - high
+	eor     $input_h0, $input_h0, $rkN_h                     // AES final block - round N high
+	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     // GHASH final-1 block - mid
+	eor     $acc_hb, $acc_hb, $rk2                           // GHASH final-1 block - high
+	ins     $rk4v.d[1], $rk4v.d[0]                           // GHASH final-1 block - mid
+	fmov    $res1d, $input_l0                                // AES final block - mov low
+	fmov    $res1.d[1], $input_h0                            // AES final block - mov high
+	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                     // GHASH final-1 block - mid
+	pmull   $rk3q1, $res0.1d, $h2.1d                         // GHASH final-1 block - low
+	eor     $res1b, $res1b, $ctr3b                           // AES final block - result
+	eor     $acc_mb, $acc_mb, $rk4v.16b                      // GHASH final-1 block - mid
+	eor     $acc_lb, $acc_lb, $rk3                           // GHASH final-1 block - low
+.Lenc_blocks_less_than_1:                                        // blocks left <= 1
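+	// Final, possibly partial, block: build a byte mask from the residual bit
+	// length, zero the ciphertext bytes beyond the message, merge in the bytes
+	// already present at the output so the full 16-byte store below is safe,
+	// and feed the masked block into GHASH before the final reduction.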
+	and     $bit_length, $bit_length, #127                   // bit_length %= 128
+	mvn     $rkN_l, xzr                                      // rkN_l = 0xffffffffffffffff
+	sub     $bit_length, $bit_length, #128                   // bit_length -= 128
+	neg     $bit_length, $bit_length                         // bit_length = 128 - #bits in input (in range [1,128])
+	ld1     { $rk0}, [$output_ptr]                           // load existing bytes where the possibly partial last block is to be stored
+	mvn     $rkN_h, xzr                                      // rkN_h = 0xffffffffffffffff
+	and     $bit_length, $bit_length, #127                   // bit_length %= 128
+	lsr     $rkN_h, $rkN_h, $bit_length                      // rkN_h is mask for top 64b of last block
+	cmp     $bit_length, #64
+	csel    $input_l0, $rkN_l, $rkN_h, lt
+	csel    $input_h0, $rkN_h, xzr, lt
+	fmov    $ctr0d, $input_l0                                // ctr0b is mask for last block
+	fmov    $ctr0.d[1], $input_h0
+	and     $res1b, $res1b, $ctr0b                           // possibly partial last block has zeroes in highest bits
+	rev64   $res0b, $res1b                                   // GHASH final block
+	eor     $res0b, $res0b, $t0.16b                          // feed in partial tag
+	bif     $res1b, $rk0, $ctr0b                             // insert existing bytes in top end of result before storing
+	pmull2  $rk2q1, $res0.2d, $h1.2d                         // GHASH final block - high
+	mov     $t0d, $res0.d[1]                                 // GHASH final block - mid
+	rev     $ctr32w, $rctr32w
+	pmull   $rk3q1, $res0.1d, $h1.1d                         // GHASH final block - low
+	eor     $acc_hb, $acc_hb, $rk2                           // GHASH final block - high
+	eor     $t0.8b, $t0.8b, $res0.8b                         // GHASH final block - mid
+	pmull   $t0.1q, $t0.1d, $h12k.1d                         // GHASH final block - mid
+	eor     $acc_lb, $acc_lb, $rk3                           // GHASH final block - low
+	eor     $acc_mb, $acc_mb, $t0.16b                        // GHASH final block - mid
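+	// MODULO: fold the Karatsuba accumulators (low, mid, high) back into a
+	// single 128-bit value, using the 0xc2 reduction constant derived from the
+	// GHASH field polynomial x^128 + x^7 + x^2 + x + 1.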
+	movi    $mod_constant.8b, #0xc2
+	eor     $t9.16b, $acc_lb, $acc_hb                        // MODULO - karatsuba tidy up
+	shl     $mod_constantd, $mod_constantd, #56              // mod_constant
+	eor     $acc_mb, $acc_mb, $t9.16b                        // MODULO - karatsuba tidy up
+	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d           // MODULO - top 64b align with mid
+	ext     $acc_hb, $acc_hb, $acc_hb, #8                    // MODULO - other top alignment
+	eor     $acc_mb, $acc_mb, $mod_t.16b                     // MODULO - fold into mid
+	eor     $acc_mb, $acc_mb, $acc_hb                        // MODULO - fold into mid
+	pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d           // MODULO - mid 64b align with low
+	ext     $acc_mb, $acc_mb, $acc_mb, #8                    // MODULO - other mid alignment
+	str     $ctr32w, [$counter, #12]                         // store the updated counter
+	st1     { $res1b}, [$output_ptr]                         // store all 16B
+	eor     $acc_lb, $acc_lb, $acc_hb                        // MODULO - fold into low
+	eor     $acc_lb, $acc_lb, $acc_mb                        // MODULO - fold into low
+	ext     $acc_lb, $acc_lb, $acc_lb, #8
+	rev64   $acc_lb, $acc_lb
+	mov     x0, $len
+	st1     { $acc_l.16b }, [$current_tag]
+	ldp     x19, x20, [sp, #16]
+	ldp     x21, x22, [sp, #32]
+	ldp     x23, x24, [sp, #48]
+	ldp     d8, d9, [sp, #64]
+	ldp     d10, d11, [sp, #80]
+	ldp     d12, d13, [sp, #96]
+	ldp     d14, d15, [sp, #112]
+	ldp     x29, x30, [sp], #128
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size aes_gcm_enc_kernel,.-aes_gcm_enc_kernel
+___
+
+{
+my $t8="v4";
+my $t8d="d4";
+my $t9="v6";
+my $t9d="d6";
+################################################################################
+# size_t aes_gcm_dec_kernel(const uint8_t *in,
+#                           size_t len_bits,
+#                           uint8_t *out,
+#                           u64 *Xi,
+#                           uint8_t ivec[16],
+#                           const void *key);
+#
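+# len_bits is the length of the input in bits, not bytes. The kernel updates
+# the current tag through Xi and the counter block through ivec, reads the
+# precomputed powers of H stored after Xi, and returns the number of bytes
+# processed.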
+$code.=<<___;
+.global aes_gcm_dec_kernel
+.type   aes_gcm_dec_kernel,%function
+.align  4
+aes_gcm_dec_kernel:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29, x30, [sp, #-128]!
+	mov	x29, sp
+	stp     x19, x20, [sp, #16]
+	mov     $counter, x4
+	mov     $cc, x5
+	stp     x21, x22, [sp, #32]
+	stp     x23, x24, [sp, #48]
+	stp     d8, d9, [sp, #64]
+	stp     d10, d11, [sp, #80]
+	stp     d12, d13, [sp, #96]
+	stp     d14, d15, [sp, #112]
+	ldr	$roundsw, [$cc, #240]
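+	// rounds is 10, 12 or 14; a single code path serves AES-128/192/256 by
+	// checking it after round 8 and round 10.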
+	add	$input_l1, $cc, $rounds, lsl #4                   // borrow input_l1 for last key
+	ldp     $rkN_l, $rkN_h, [$input_l1]                       // load round N keys
+	ldr     $rkNm1q, [$input_l1, #-16]                        // load round N-1 keys
+	lsr     $main_end_input_ptr, $bit_length, #3              // byte_len
+	mov     $len, $main_end_input_ptr
+	ldp     $ctr96_b64x, $ctr96_t32x, [$counter]              // ctr96_b64, ctr96_t32
+	ldr     $rk8q, [$cc, #128]                                // load rk8
+	sub     $main_end_input_ptr, $main_end_input_ptr, #1      // byte_len - 1
+	ldr     $rk7q, [$cc, #112]                                // load rk7
+	and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+	add     $end_input_ptr, $input_ptr, $bit_length, lsr #3   // end_input_ptr
+	ldr     $rk6q, [$cc, #96]                                 // load rk6
+	lsr     $rctr32x, $ctr96_t32x, #32
+	ldr     $rk5q, [$cc, #80]                                 // load rk5
+	orr     $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
+	ldr     $rk3q, [$cc, #48]                                 // load rk3
+	add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
+	rev     $rctr32w, $rctr32w                                // rev_ctr32
+	add     $rctr32w, $rctr32w, #1                            // increment rev_ctr32
+	fmov    $ctr3d, $ctr96_b64x                               // CTR block 3
+	rev     $ctr32w, $rctr32w                                 // CTR block 1
+	add     $rctr32w, $rctr32w, #1                            // CTR block 1
+	fmov    $ctr1d, $ctr96_b64x                               // CTR block 1
+	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 1
+	ld1     { $ctr0b}, [$counter]                             // special case vector load initial counter so we can start first AES block as quickly as possible
+	fmov    $ctr1.d[1], $ctr32x                               // CTR block 1
+	rev     $ctr32w, $rctr32w                                 // CTR block 2
+	add     $rctr32w, $rctr32w, #1                            // CTR block 2
+	fmov    $ctr2d, $ctr96_b64x                               // CTR block 2
+	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 2
+	fmov    $ctr2.d[1], $ctr32x                               // CTR block 2
+	rev     $ctr32w, $rctr32w                                 // CTR block 3
+	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 3
+	ldr     $rk0q, [$cc, #0]                                  // load rk0
+	fmov    $ctr3.d[1], $ctr32x                               // CTR block 3
+	add     $rctr32w, $rctr32w, #1                            // CTR block 3
+	ldr     $rk4q, [$cc, #64]                                 // load rk4
+	ldr     $rk1q, [$cc, #16]                                 // load rk1
+	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 0
+	ldr     $h3q, [$current_tag, #80]                         // load h3l | h3h
+	ext     $h3b, $h3b, $h3b, #8
+	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 0
+	ldr     $h4q, [$current_tag, #112]                        // load h4l | h4h
+	ext     $h4b, $h4b, $h4b, #8
+	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 0
+	ldr     $h2q, [$current_tag, #64]                         // load h2l | h2h
+	ext     $h2b, $h2b, $h2b, #8
+	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 0
+	ldr     $rk2q, [$cc, #32]                                 // load rk2
+	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 1
+	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 1
+	ld1     { $acc_lb}, [$current_tag]
+	ext     $acc_lb, $acc_lb, $acc_lb, #8
+	rev64   $acc_lb, $acc_lb
+	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 1
+	ldr     $rk9q, [$cc, #144]                                // load rk9
+	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 1
+	ldr     $rk12q, [$cc, #192]                               // load rk12
+	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 2
+	ldr     $h1q, [$current_tag, #32]                         // load h1l | h1h
+	ext     $h1b, $h1b, $h1b, #8
+	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 2
+	ldr     $rk10q, [$cc, #160]                               // load rk10
+	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 2
+	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 3
+	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 2
+	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 3
+	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 4
+	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 3
+	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 3
+	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 4
+	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 4
+	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 4
+	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 5
+	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 5
+	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 5
+	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 5
+	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 6
+	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 6
+	cmp     $rounds, #12                                      // setup flags for AES-128/192/256 check
+	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 6
+	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 6
+	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 7
+	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 7
+	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 7
+	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 8
+	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 7
+	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 8
+	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 8
+	ldr     $rk11q, [$cc, #176]                               // load rk11
+	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 8
+	b.lt	.Ldec_finish_first_blocks                         // branch if AES-128
+
+	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 9
+	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 9
+	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 9
+	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 9
+	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 10
+	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 10
+	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 10
+	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 10
+	b.eq	.Ldec_finish_first_blocks                         // branch if AES-192
+
+	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 11
+	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 11
+	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 11
+	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 11
+	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          // AES block 1 - round 12
+	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          // AES block 0 - round 12
+	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          // AES block 2 - round 12
+	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          // AES block 3 - round 12
+
+.Ldec_finish_first_blocks:
+	cmp     $input_ptr, $main_end_input_ptr                   // check if we have <= 4 blocks
+	trn1    $acc_h.2d, $h3.2d,    $h4.2d                      // h4h | h3h
+	trn2    $h34k.2d,  $h3.2d,    $h4.2d                      // h4l | h3l
+	trn1    $t0.2d,    $h1.2d,    $h2.2d                      // h2h | h1h
+	trn2    $h12k.2d,  $h1.2d,    $h2.2d                      // h2l | h1l
+	eor     $h34k.16b, $h34k.16b, $acc_h.16b                  // h4k | h3k
+	aese    $ctr1b, $rkNm1                                    // AES block 1 - round N-1
+	aese    $ctr2b, $rkNm1                                    // AES block 2 - round N-1
+	eor     $h12k.16b, $h12k.16b, $t0.16b                     // h2k | h1k
+	aese    $ctr3b, $rkNm1                                    // AES block 3 - round N-1
+	aese    $ctr0b, $rkNm1                                    // AES block 0 - round N-1
+	b.ge    .Ldec_tail                                        // handle tail
+
+	ldr     $res0q, [$input_ptr, #0]                          // AES block 0 - load ciphertext
+	ldr     $res1q, [$input_ptr, #16]                         // AES block 1 - load ciphertext
+	rev     $ctr32w, $rctr32w                                 // CTR block 4
+	eor     $ctr0b, $res0b, $ctr0b                            // AES block 0 - result
+	eor     $ctr1b, $res1b, $ctr1b                            // AES block 1 - result
+	rev64   $res1b, $res1b                                    // GHASH block 1
+	ldr     $res3q, [$input_ptr, #48]                         // AES block 3 - load ciphertext
+	mov     $output_h0, $ctr0.d[1]                            // AES block 0 - mov high
+	mov     $output_l0, $ctr0.d[0]                            // AES block 0 - mov low
+	rev64   $res0b, $res0b                                    // GHASH block 0
+	add     $rctr32w, $rctr32w, #1                            // CTR block 4
+	fmov    $ctr0d, $ctr96_b64x                               // CTR block 4
+	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 4
+	fmov    $ctr0.d[1], $ctr32x                               // CTR block 4
+	rev     $ctr32w, $rctr32w                                 // CTR block 5
+	add     $rctr32w, $rctr32w, #1                            // CTR block 5
+	mov     $output_l1, $ctr1.d[0]                            // AES block 1 - mov low
+	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 5
+	mov     $output_h1, $ctr1.d[1]                            // AES block 1 - mov high
+	eor     $output_h0, $output_h0, $rkN_h                    // AES block 0 - round N high
+	eor     $output_l0, $output_l0, $rkN_l                    // AES block 0 - round N low
+	stp     $output_l0, $output_h0, [$output_ptr], #16        // AES block 0 - store result
+	fmov    $ctr1d, $ctr96_b64x                               // CTR block 5
+	ldr     $res2q, [$input_ptr, #32]                         // AES block 2 - load ciphertext
+	add     $input_ptr, $input_ptr, #64                       // AES input_ptr update
+	fmov    $ctr1.d[1], $ctr32x                               // CTR block 5
+	rev     $ctr32w, $rctr32w                                 // CTR block 6
+	add     $rctr32w, $rctr32w, #1                            // CTR block 6
+	eor     $output_l1, $output_l1, $rkN_l                    // AES block 1 - round N low
+	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 6
+	eor     $output_h1, $output_h1, $rkN_h                    // AES block 1 - round N high
+	stp     $output_l1, $output_h1, [$output_ptr], #16        // AES block 1 - store result
+	eor     $ctr2b, $res2b, $ctr2b                            // AES block 2 - result
+	cmp     $input_ptr, $main_end_input_ptr                   // check if we have <= 8 blocks
+	b.ge    .Ldec_prepretail                                  // do prepretail
+
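+	// Main decryption loop: each iteration decrypts blocks 4k+4..4k+7 with CTR
+	// while folding the previous four ciphertext blocks (4k..4k+3) into the
+	// GHASH accumulator, interleaving AES rounds with the PMULL work.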
+.Ldec_main_loop:                                                  // main loop start
+	mov     $output_l2, $ctr2.d[0]                            // AES block 4k+2 - mov low
+	ext     $acc_lb, $acc_lb, $acc_lb, #8                     // PRE 0
+	eor     $ctr3b, $res3b, $ctr3b                            // AES block 4k+3 - result
+	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 0
+	mov     $output_h2, $ctr2.d[1]                            // AES block 4k+2 - mov high
+	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 0
+	fmov    $ctr2d, $ctr96_b64x                               // CTR block 4k+6
+	fmov    $ctr2.d[1], $ctr32x                               // CTR block 4k+6
+	eor     $res0b, $res0b, $acc_lb                           // PRE 1
+	rev     $ctr32w, $rctr32w                                 // CTR block 4k+7
+	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 1
+	mov     $output_h3, $ctr3.d[1]                            // AES block 4k+3 - mov high
+	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 1
+	mov     $output_l3, $ctr3.d[0]                            // AES block 4k+3 - mov low
+	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       // GHASH block 4k - high
+	mov     $t0d, $res0.d[1]                                  // GHASH block 4k - mid
+	fmov    $ctr3d, $ctr96_b64x                               // CTR block 4k+7
+	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 2
+	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 4k+7
+	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 0
+	fmov    $ctr3.d[1], $ctr32x                               // CTR block 4k+7
+	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 2
+	eor     $t0.8b, $t0.8b, $res0.8b                          // GHASH block 4k - mid
+	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 3
+	eor     $output_h2, $output_h2, $rkN_h                    // AES block 4k+2 - round N high
+	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 1
+	mov     $acc_md, $h34k.d[1]                               // GHASH block 4k - mid
+	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 3
+	rev64   $res2b, $res2b                                    // GHASH block 4k+2
+	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 0
+	eor     $output_l2, $output_l2, $rkN_l                    // AES block 4k+2 - round N low
+	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 2
+	stp     $output_l2, $output_h2, [$output_ptr], #16        // AES block 4k+2 - store result
+	pmull   $acc_l.1q, $res0.1d, $h4.1d                       // GHASH block 4k - low
+	pmull2  $t1.1q, $res1.2d, $h3.2d                          // GHASH block 4k+1 - high
+	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 3
+	rev64   $res3b, $res3b                                    // GHASH block 4k+3
+	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      // GHASH block 4k - mid
+	eor     $output_l3, $output_l3, $rkN_l                    // AES block 4k+3 - round N low
+	pmull   $t2.1q, $res1.1d, $h3.1d                          // GHASH block 4k+1 - low
+	eor     $output_h3, $output_h3, $rkN_h                    // AES block 4k+3 - round N high
+	eor     $acc_hb, $acc_hb, $t1.16b                         // GHASH block 4k+1 - high
+	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 4
+	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 1
+	mov     $t3d, $res1.d[1]                                  // GHASH block 4k+1 - mid
+	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 4
+	eor     $acc_lb, $acc_lb, $t2.16b                         // GHASH block 4k+1 - low
+	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 5
+	add     $rctr32w, $rctr32w, #1                            // CTR block 4k+7
+	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 2
+	mov     $t6d, $res2.d[1]                                  // GHASH block 4k+2 - mid
+	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 4
+	eor     $t3.8b, $t3.8b, $res1.8b                          // GHASH block 4k+1 - mid
+	pmull   $t5.1q, $res2.1d, $h2.1d                          // GHASH block 4k+2 - low
+	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 3
+	eor     $t6.8b, $t6.8b, $res2.8b                          // GHASH block 4k+2 - mid
+	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 5
+	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 5
+	eor     $acc_lb, $acc_lb, $t5.16b                         // GHASH block 4k+2 - low
+	pmull   $t3.1q, $t3.1d, $h34k.1d                          // GHASH block 4k+1 - mid
+	rev     $ctr32w, $rctr32w                                 // CTR block 4k+8
+	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 6
+	ins     $t6.d[1], $t6.d[0]                                // GHASH block 4k+2 - mid
+	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 6
+	add     $rctr32w, $rctr32w, #1                            // CTR block 4k+8
+	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 4
+	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 7
+	eor     $acc_mb, $acc_mb, $t3.16b                         // GHASH block 4k+1 - mid
+	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 7
+	pmull2  $t4.1q, $res2.2d, $h2.2d                          // GHASH block 4k+2 - high
+	mov     $t9d, $res3.d[1]                                  // GHASH block 4k+3 - mid
+	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 5
+	pmull2  $t6.1q, $t6.2d, $h12k.2d                          // GHASH block 4k+2 - mid
+	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 8
+	eor     $acc_hb, $acc_hb, $t4.16b                         // GHASH block 4k+2 - high
+	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 6
+	pmull   $t8.1q, $res3.1d, $h1.1d                          // GHASH block 4k+3 - low
+	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 4k+8
+	eor     $acc_mb, $acc_mb, $t6.16b                         // GHASH block 4k+2 - mid
+	pmull2  $t7.1q, $res3.2d, $h1.2d                          // GHASH block 4k+3 - high
+	cmp     $rounds, #12                                      // setup flags for AES-128/192/256 check
+	eor     $t9.8b, $t9.8b, $res3.8b                          // GHASH block 4k+3 - mid
+	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 8
+	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 6
+	eor     $acc_hb, $acc_hb, $t7.16b                         // GHASH block 4k+3 - high
+	pmull   $t9.1q, $t9.1d, $h12k.1d                          // GHASH block 4k+3 - mid
+	movi    $mod_constant.8b, #0xc2
+	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 7
+	eor     $acc_lb, $acc_lb, $t8.16b                         // GHASH block 4k+3 - low
+	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 7
+	shl     $mod_constantd, $mod_constantd, #56               // mod_constant
+	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 8
+	eor     $acc_mb, $acc_mb, $t9.16b                         // GHASH block 4k+3 - mid
+	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 8
+	b.lt	.Ldec_main_loop_continue                          // branch if AES-128
+
+	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 9
+	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 9
+	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 9
+	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 9
+	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 10
+	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 10
+	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 10
+	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 10
+	b.eq	.Ldec_main_loop_continue                          // branch if AES-192
+
+	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 11
+	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 11
+	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 11
+	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 11
+	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 12
+	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 12
+	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 12
+	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 12
+
+.Ldec_main_loop_continue:
+	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            // MODULO - top 64b align with mid
+	eor     $t9.16b, $acc_lb, $acc_hb                         // MODULO - karatsuba tidy up
+	ldr     $res0q, [$input_ptr, #0]                          // AES block 4k+4 - load ciphertext
+	aese    $ctr0b, $rkNm1                                    // AES block 4k+4 - round N-1
+	ext     $acc_hb, $acc_hb, $acc_hb, #8                     // MODULO - other top alignment
+	eor     $acc_mb, $acc_mb, $t9.16b                         // MODULO - karatsuba tidy up
+	ldr     $res1q, [$input_ptr, #16]                         // AES block 4k+5 - load ciphertext
+	eor     $ctr0b, $res0b, $ctr0b                            // AES block 4k+4 - result
+	stp     $output_l3, $output_h3, [$output_ptr], #16        // AES block 4k+3 - store result
+	eor     $acc_mb, $acc_mb, $mod_t.16b                      // MODULO - fold into mid
+	ldr     $res3q, [$input_ptr, #48]                         // AES block 4k+7 - load ciphertext
+	ldr     $res2q, [$input_ptr, #32]                         // AES block 4k+6 - load ciphertext
+	mov     $output_h0, $ctr0.d[1]                            // AES block 4k+4 - mov high
+	eor     $acc_mb, $acc_mb, $acc_hb                         // MODULO - fold into mid
+	aese    $ctr1b, $rkNm1                                    // AES block 4k+5 - round N-1
+	add     $input_ptr, $input_ptr, #64                       // AES input_ptr update
+	mov     $output_l0, $ctr0.d[0]                            // AES block 4k+4 - mov low
+	fmov    $ctr0d, $ctr96_b64x                               // CTR block 4k+8
+	fmov    $ctr0.d[1], $ctr32x                               // CTR block 4k+8
+	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     // MODULO - mid 64b align with low
+	eor     $ctr1b, $res1b, $ctr1b                            // AES block 4k+5 - result
+	rev     $ctr32w, $rctr32w                                 // CTR block 4k+9
+	aese    $ctr2b, $rkNm1                                    // AES block 4k+6 - round N-1
+	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 4k+9
+	cmp     $input_ptr, $main_end_input_ptr                   // LOOP CONTROL
+	add     $rctr32w, $rctr32w, #1                            // CTR block 4k+9
+	eor     $output_l0, $output_l0, $rkN_l                    // AES block 4k+4 - round N low
+	eor     $output_h0, $output_h0, $rkN_h                    // AES block 4k+4 - round N high
+	mov     $output_h1, $ctr1.d[1]                            // AES block 4k+5 - mov high
+	eor     $ctr2b, $res2b, $ctr2b                            // AES block 4k+6 - result
+	eor     $acc_lb, $acc_lb, $mod_constant.16b               // MODULO - fold into low
+	mov     $output_l1, $ctr1.d[0]                            // AES block 4k+5 - mov low
+	fmov    $ctr1d, $ctr96_b64x                               // CTR block 4k+9
+	ext     $acc_mb, $acc_mb, $acc_mb, #8                     // MODULO - other mid alignment
+	fmov    $ctr1.d[1], $ctr32x                               // CTR block 4k+9
+	rev     $ctr32w, $rctr32w                                 // CTR block 4k+10
+	add     $rctr32w, $rctr32w, #1                            // CTR block 4k+10
+	aese    $ctr3b, $rkNm1                                    // AES block 4k+7 - round N-1
+	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 4k+10
+	rev64   $res1b, $res1b                                    // GHASH block 4k+5
+	eor     $output_h1, $output_h1, $rkN_h                    // AES block 4k+5 - round N high
+	stp     $output_l0, $output_h0, [$output_ptr], #16        // AES block 4k+4 - store result
+	eor     $output_l1, $output_l1, $rkN_l                    // AES block 4k+5 - round N low
+	stp     $output_l1, $output_h1, [$output_ptr], #16        // AES block 4k+5 - store result
+	rev64   $res0b, $res0b                                    // GHASH block 4k+4
+	eor     $acc_lb, $acc_lb, $acc_mb                         // MODULO - fold into low
+	b.lt    .Ldec_main_loop
+
+.Ldec_prepretail:                                                 // PREPRETAIL
+	ext     $acc_lb, $acc_lb, $acc_lb, #8                     // PRE 0
+	mov     $output_l2, $ctr2.d[0]                            // AES block 4k+2 - mov low
+	eor     $ctr3b, $res3b, $ctr3b                            // AES block 4k+3 - result
+	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 0
+	mov     $output_h2, $ctr2.d[1]                            // AES block 4k+2 - mov high
+	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 0
+	fmov    $ctr2d, $ctr96_b64x                               // CTR block 4k+6
+	fmov    $ctr2.d[1], $ctr32x                               // CTR block 4k+6
+	rev     $ctr32w, $rctr32w                                 // CTR block 4k+7
+	eor     $res0b, $res0b, $acc_lb                           // PRE 1
+	rev64   $res2b, $res2b                                    // GHASH block 4k+2
+	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            // CTR block 4k+7
+	mov     $output_l3, $ctr3.d[0]                            // AES block 4k+3 - mov low
+	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 1
+	mov     $output_h3, $ctr3.d[1]                            // AES block 4k+3 - mov high
+	pmull   $acc_l.1q, $res0.1d, $h4.1d                       // GHASH block 4k - low
+	mov     $t0d, $res0.d[1]                                  // GHASH block 4k - mid
+	fmov    $ctr3d, $ctr96_b64x                               // CTR block 4k+7
+	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       // GHASH block 4k - high
+	fmov    $ctr3.d[1], $ctr32x                               // CTR block 4k+7
+	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 0
+	mov     $acc_md, $h34k.d[1]                               // GHASH block 4k - mid
+	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 1
+	eor     $t0.8b, $t0.8b, $res0.8b                          // GHASH block 4k - mid
+	pmull2  $t1.1q, $res1.2d, $h3.2d                          // GHASH block 4k+1 - high
+	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 1
+	rev64   $res3b, $res3b                                    // GHASH block 4k+3
+	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 0
+	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      // GHASH block 4k - mid
+	eor     $acc_hb, $acc_hb, $t1.16b                         // GHASH block 4k+1 - high
+	pmull   $t2.1q, $res1.1d, $h3.1d                          // GHASH block 4k+1 - low
+	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 1
+	mov     $t3d, $res1.d[1]                                  // GHASH block 4k+1 - mid
+	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 2
+	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 2
+	eor     $acc_lb, $acc_lb, $t2.16b                         // GHASH block 4k+1 - low
+	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 2
+	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 3
+	mov     $t6d, $res2.d[1]                                  // GHASH block 4k+2 - mid
+	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 2
+	eor     $t3.8b, $t3.8b, $res1.8b                          // GHASH block 4k+1 - mid
+	pmull   $t5.1q, $res2.1d, $h2.1d                          // GHASH block 4k+2 - low
+	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 4
+	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 3
+	eor     $t6.8b, $t6.8b, $res2.8b                          // GHASH block 4k+2 - mid
+	pmull   $t3.1q, $t3.1d, $h34k.1d                          // GHASH block 4k+1 - mid
+	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 5
+	eor     $acc_lb, $acc_lb, $t5.16b                         // GHASH block 4k+2 - low
+	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 4
+	pmull2  $t7.1q, $res3.2d, $h1.2d                          // GHASH block 4k+3 - high
+	eor     $acc_mb, $acc_mb, $t3.16b                         // GHASH block 4k+1 - mid
+	pmull2  $t4.1q, $res2.2d, $h2.2d                          // GHASH block 4k+2 - high
+	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 5
+	ins     $t6.d[1], $t6.d[0]                                // GHASH block 4k+2 - mid
+	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 3
+	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 3
+	eor     $acc_hb, $acc_hb, $t4.16b                         // GHASH block 4k+2 - high
+	pmull   $t8.1q, $res3.1d, $h1.1d                          // GHASH block 4k+3 - low
+	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 4
+	mov     $t9d, $res3.d[1]                                  // GHASH block 4k+3 - mid
+	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 4
+	pmull2  $t6.1q, $t6.2d, $h12k.2d                          // GHASH block 4k+2 - mid
+	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 5
+	eor     $t9.8b, $t9.8b, $res3.8b                          // GHASH block 4k+3 - mid
+	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 5
+	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 6
+	eor     $acc_mb, $acc_mb, $t6.16b                         // GHASH block 4k+2 - mid
+	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 6
+	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 6
+	movi    $mod_constant.8b, #0xc2
+	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 6
+	eor     $acc_lb, $acc_lb, $t8.16b                         // GHASH block 4k+3 - low
+	pmull   $t9.1q, $t9.1d, $h12k.1d                          // GHASH block 4k+3 - mid
+	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 7
+	cmp     $rounds, #12                                      // setup flags for AES-128/192/256 check
+	eor     $acc_hb, $acc_hb, $t7.16b                         // GHASH block 4k+3 - high
+	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 7
+	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 7
+	eor     $acc_mb, $acc_mb, $t9.16b                         // GHASH block 4k+3 - mid
+	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 8
+	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 7
+	eor     $t9.16b, $acc_lb, $acc_hb                         // MODULO - karatsuba tidy up
+	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 8
+	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 8
+	shl     $mod_constantd, $mod_constantd, #56               // mod_constant
+	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 8
+	b.lt	.Ldec_finish_prepretail                           // branch if AES-128
+
+	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 9
+	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 9
+	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 9
+	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 9
+	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 10
+	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 10
+	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 10
+	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 10
+	b.eq	.Ldec_finish_prepretail                           // branch if AES-192
+
+	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 11
+	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 11
+	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 11
+	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          // AES block 4k+6 - round 12
+	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 11
+	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          // AES block 4k+5 - round 12
+	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          // AES block 4k+4 - round 12
+	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          // AES block 4k+7 - round 12
+
+.Ldec_finish_prepretail:
+	eor     $acc_mb, $acc_mb, $t9.16b                         // MODULO - karatsuba tidy up
+	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            // MODULO - top 64b align with mid
+	ext     $acc_hb, $acc_hb, $acc_hb, #8                     // MODULO - other top alignment
+	eor     $acc_mb, $acc_mb, $mod_t.16b                      // MODULO - fold into mid
+	eor     $output_h2, $output_h2, $rkN_h                    // AES block 4k+2 - round N high
+	eor     $output_l3, $output_l3, $rkN_l                    // AES block 4k+3 - round N low
+	eor     $acc_mb, $acc_mb, $acc_hb                         // MODULO - fold into mid
+	add     $rctr32w, $rctr32w, #1                            // CTR block 4k+7
+	eor     $output_l2, $output_l2, $rkN_l                    // AES block 4k+2 - round N low
+	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     // MODULO - mid 64b align with low
+	eor     $output_h3, $output_h3, $rkN_h                    // AES block 4k+3 - round N high
+	stp     $output_l2, $output_h2, [$output_ptr], #16        // AES block 4k+2 - store result
+	ext     $acc_mb, $acc_mb, $acc_mb, #8                     // MODULO - other mid alignment
+	stp     $output_l3, $output_h3, [$output_ptr], #16        // AES block 4k+3 - store result
+
+	eor     $acc_lb, $acc_lb, $mod_constant.16b               // MODULO - fold into low
+	aese    $ctr1b, $rkNm1                                    // AES block 4k+5 - round N-1
+	aese    $ctr0b, $rkNm1                                    // AES block 4k+4 - round N-1
+	aese    $ctr3b, $rkNm1                                    // AES block 4k+7 - round N-1
+	aese    $ctr2b, $rkNm1                                    // AES block 4k+6 - round N-1
+	eor     $acc_lb, $acc_lb, $acc_mb                         // MODULO - fold into low
+
+.Ldec_tail:                                                       // TAIL
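+	// One to four blocks remain, the last possibly partial. Fall through the
+	// .Ldec_blocks_more_than_* chain, pairing each remaining ciphertext block
+	// with one of the pre-computed keystream blocks.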
+	sub     $main_end_input_ptr, $end_input_ptr, $input_ptr   // main_end_input_ptr is number of bytes left to process
+	ld1     { $res1b}, [$input_ptr], #16                      // AES block 4k+4 - load ciphertext
+	eor     $ctr0b, $res1b, $ctr0b                            // AES block 4k+4 - result
+	mov     $output_l0, $ctr0.d[0]                            // AES block 4k+4 - mov low
+	mov     $output_h0, $ctr0.d[1]                            // AES block 4k+4 - mov high
+	ext     $t0.16b, $acc_lb, $acc_lb, #8                     // prepare final partial tag
+	cmp     $main_end_input_ptr, #48
+	eor     $output_l0, $output_l0, $rkN_l                    // AES block 4k+4 - round N low
+	eor     $output_h0, $output_h0, $rkN_h                    // AES block 4k+4 - round N high
+	b.gt    .Ldec_blocks_more_than_3
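+	// Three or fewer blocks remain: zero the GHASH accumulators (the skipped
+	// final-3 step would otherwise have initialised them), move the keystream
+	// blocks still needed into the registers the shared tail code expects, and
+	// roll the counter back once for each pre-computed block left unused.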
+	sub     $rctr32w, $rctr32w, #1
+	mov     $ctr3b, $ctr2b
+	movi    $acc_m.8b, #0
+	movi    $acc_l.8b, #0
+	cmp     $main_end_input_ptr, #32
+	movi    $acc_h.8b, #0
+	mov     $ctr2b, $ctr1b
+	b.gt    .Ldec_blocks_more_than_2
+	sub     $rctr32w, $rctr32w, #1
+	mov     $ctr3b, $ctr1b
+	cmp     $main_end_input_ptr, #16
+	b.gt    .Ldec_blocks_more_than_1
+	sub     $rctr32w, $rctr32w, #1
+	b       .Ldec_blocks_less_than_1
+.Ldec_blocks_more_than_3:                                    // blocks left >  3
+	rev64   $res0b, $res1b                                   // GHASH final-3 block
+	ld1     { $res1b}, [$input_ptr], #16                     // AES final-2 block - load ciphertext
+	stp     $output_l0, $output_h0, [$output_ptr], #16       // AES final-3 block  - store result
+	mov     $acc_md, $h34k.d[1]                              // GHASH final-3 block - mid
+	eor     $res0b, $res0b, $t0.16b                          // feed in partial tag
+	eor     $ctr0b, $res1b, $ctr1b                           // AES final-2 block - result
+	mov     $rk4d, $res0.d[1]                                // GHASH final-3 block - mid
+	mov     $output_l0, $ctr0.d[0]                           // AES final-2 block - mov low
+	mov     $output_h0, $ctr0.d[1]                           // AES final-2 block - mov high
+	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     // GHASH final-3 block - mid
+	movi    $t0.8b, #0                                       // suppress further partial tag feed in
+	pmull2  $acc_h.1q, $res0.2d, $h4.2d                      // GHASH final-3 block - high
+	pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                   // GHASH final-3 block - mid
+	eor     $output_l0, $output_l0, $rkN_l                   // AES final-2 block - round N low
+	pmull   $acc_l.1q, $res0.1d, $h4.1d                      // GHASH final-3 block - low
+	eor     $output_h0, $output_h0, $rkN_h                   // AES final-2 block - round N high
+.Ldec_blocks_more_than_2:                                    // blocks left >  2
+	rev64   $res0b, $res1b                                   // GHASH final-2 block
+	ld1     { $res1b}, [$input_ptr], #16                     // AES final-1 block - load ciphertext
+	eor     $res0b, $res0b, $t0.16b                          // feed in partial tag
+	stp     $output_l0, $output_h0, [$output_ptr], #16       // AES final-2 block  - store result
+	eor     $ctr0b, $res1b, $ctr2b                           // AES final-1 block - result
+	mov     $rk4d, $res0.d[1]                                // GHASH final-2 block - mid
+	pmull   $rk3q1, $res0.1d, $h3.1d                         // GHASH final-2 block - low
+	pmull2  $rk2q1, $res0.2d, $h3.2d                         // GHASH final-2 block - high
+	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     // GHASH final-2 block - mid
+	mov     $output_l0, $ctr0.d[0]                           // AES final-1 block - mov low
+	mov     $output_h0, $ctr0.d[1]                           // AES final-1 block - mov high
+	eor     $acc_lb, $acc_lb, $rk3                           // GHASH final-2 block - low
+	movi    $t0.8b, #0                                       // suppress further partial tag feed in
+	pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                     // GHASH final-2 block - mid
+	eor     $acc_hb, $acc_hb, $rk2                           // GHASH final-2 block - high
+	eor     $output_l0, $output_l0, $rkN_l                   // AES final-1 block - round N low
+	eor     $acc_mb, $acc_mb, $rk4v.16b                      // GHASH final-2 block - mid
+	eor     $output_h0, $output_h0, $rkN_h                   // AES final-1 block - round N high
+.Ldec_blocks_more_than_1:                                        // blocks left >  1
+	stp     $output_l0, $output_h0, [$output_ptr], #16       // AES final-1 block  - store result
+	rev64   $res0b, $res1b                                   // GHASH final-1 block
+	ld1     { $res1b}, [$input_ptr], #16                     // AES final block - load ciphertext
+	eor     $res0b, $res0b, $t0.16b                          // feed in partial tag
+	movi    $t0.8b, #0                                       // suppress further partial tag feed in
+	mov     $rk4d, $res0.d[1]                                // GHASH final-1 block - mid
+	eor     $ctr0b, $res1b, $ctr3b                           // AES final block - result
+	pmull2  $rk2q1, $res0.2d, $h2.2d                         // GHASH final-1 block - high
+	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     // GHASH final-1 block - mid
+	pmull   $rk3q1, $res0.1d, $h2.1d                         // GHASH final-1 block - low
+	mov     $output_l0, $ctr0.d[0]                           // AES final block - mov low
+	ins     $rk4v.d[1], $rk4v.d[0]                           // GHASH final-1 block - mid
+	mov     $output_h0, $ctr0.d[1]                           // AES final block - mov high
+	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                     // GHASH final-1 block - mid
+	eor     $output_l0, $output_l0, $rkN_l                   // AES final block - round N low
+	eor     $acc_lb, $acc_lb, $rk3                           // GHASH final-1 block - low
+	eor     $acc_hb, $acc_hb, $rk2                           // GHASH final-1 block - high
+	eor     $acc_mb, $acc_mb, $rk4v.16b                      // GHASH final-1 block - mid
+	eor     $output_h0, $output_h0, $rkN_h                   // AES final block - round N high
+.Ldec_blocks_less_than_1:                                        // blocks left <= 1
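+	// Final, possibly partial, block: build a byte mask from the residual bit
+	// length, drop the decrypted bytes beyond the message and replace them with
+	// the bytes already at the output so the 16-byte store below is safe, and
+	// mask the ciphertext the same way before feeding it into GHASH.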
+	and     $bit_length, $bit_length, #127                   // bit_length %= 128
+	mvn     $rkN_h, xzr                                      // rkN_h = 0xffffffffffffffff
+	sub     $bit_length, $bit_length, #128                   // bit_length -= 128
+	mvn     $rkN_l, xzr                                      // rkN_l = 0xffffffffffffffff
+	ldp     $end_input_ptr, $main_end_input_ptr, [$output_ptr] // load existing bytes that must not be overwritten
+	neg     $bit_length, $bit_length                         // bit_length = 128 - #bits in input (in range [1,128])
+	and     $bit_length, $bit_length, #127                   // bit_length %= 128
+	lsr     $rkN_h, $rkN_h, $bit_length                      // rkN_h is mask for top 64b of last block
+	cmp     $bit_length, #64
+	csel    $ctr32x, $rkN_l, $rkN_h, lt
+	csel    $ctr96_b64x, $rkN_h, xzr, lt
+	fmov    $ctr0d, $ctr32x                                  // ctr0b is mask for last block
+	and     $output_l0, $output_l0, $ctr32x
+	mov     $ctr0.d[1], $ctr96_b64x
+	bic     $end_input_ptr, $end_input_ptr, $ctr32x          // mask out low existing bytes
+	rev     $ctr32w, $rctr32w
+	bic     $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x      // mask out high existing bytes
+	orr     $output_l0, $output_l0, $end_input_ptr
+	and     $output_h0, $output_h0, $ctr96_b64x
+	orr     $output_h0, $output_h0, $main_end_input_ptr
+	and     $res1b, $res1b, $ctr0b                            // possibly partial last block has zeroes in highest bits
+	rev64   $res0b, $res1b                                    // GHASH final block
+	eor     $res0b, $res0b, $t0.16b                           // feed in partial tag
+	pmull   $rk3q1, $res0.1d, $h1.1d                          // GHASH final block - low
+	mov     $t0d, $res0.d[1]                                  // GHASH final block - mid
+	eor     $t0.8b, $t0.8b, $res0.8b                          // GHASH final block - mid
+	pmull2  $rk2q1, $res0.2d, $h1.2d                          // GHASH final block - high
+	pmull   $t0.1q, $t0.1d, $h12k.1d                          // GHASH final block - mid
+	eor     $acc_hb, $acc_hb, $rk2                            // GHASH final block - high
+	eor     $acc_lb, $acc_lb, $rk3                            // GHASH final block - low
+	eor     $acc_mb, $acc_mb, $t0.16b                         // GHASH final block - mid
+	movi    $mod_constant.8b, #0xc2
+	eor     $t9.16b, $acc_lb, $acc_hb                         // MODULO - karatsuba tidy up
+	shl     $mod_constantd, $mod_constantd, #56               // mod_constant
+	eor     $acc_mb, $acc_mb, $t9.16b                         // MODULO - karatsuba tidy up
+	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            // MODULO - top 64b align with mid
+	ext     $acc_hb, $acc_hb, $acc_hb, #8                     // MODULO - other top alignment
+	eor     $acc_mb, $acc_mb, $mod_t.16b                      // MODULO - fold into mid
+	eor     $acc_mb, $acc_mb, $acc_hb                         // MODULO - fold into mid
+	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     // MODULO - mid 64b align with low
+	ext     $acc_mb, $acc_mb, $acc_mb, #8                     // MODULO - other mid alignment
+	eor     $acc_lb, $acc_lb, $mod_constant.16b               // MODULO - fold into low
+	stp     $output_l0, $output_h0, [$output_ptr]
+	str     $ctr32w, [$counter, #12]                          // store the updated counter
+	eor     $acc_lb, $acc_lb, $acc_mb                         // MODULO - fold into low
+	ext     $acc_lb, $acc_lb, $acc_lb, #8
+	rev64   $acc_lb, $acc_lb
+	mov     x0, $len
+	st1     { $acc_l.16b }, [$current_tag]
+	ldp     x19, x20, [sp, #16]
+	ldp     x21, x22, [sp, #32]
+	ldp     x23, x24, [sp, #48]
+	ldp     d8, d9, [sp, #64]
+	ldp     d10, d11, [sp, #80]
+	ldp     d12, d13, [sp, #96]
+	ldp     d14, d15, [sp, #112]
+	ldp     x29, x30, [sp], #128
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size aes_gcm_dec_kernel,.-aes_gcm_dec_kernel
+___
+}
+}
+
+$code.=<<___;
+#endif
+___
+
+print $code;
+close STDOUT or die "error closing STDOUT: $!"; # enforce flush
diff --git a/crypto/fipsmodule/modes/gcm.c b/crypto/fipsmodule/modes/gcm.c
index a268bff..de8610a 100644
--- a/crypto/fipsmodule/modes/gcm.c
+++ b/crypto/fipsmodule/modes/gcm.c
@@ -146,6 +146,32 @@
 }
 #endif  // HW_GCM && X86_64
 
+#if defined(HW_GCM) && defined(OPENSSL_AARCH64)
+
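+// The kernels take their length in bits. These wrappers only hand them whole
+// 16-byte blocks and return the number of bytes processed, so the caller can
+// handle any remaining partial block.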
+static size_t hw_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
+                             const AES_KEY *key, uint8_t ivec[16],
+                             uint64_t *Xi) {
+  const size_t len_blocks = len & kSizeTWithoutLower4Bits;
+  if (!len_blocks) {
+    return 0;
+  }
+  aes_gcm_enc_kernel(in, len_blocks * 8, out, Xi, ivec, key);
+  return len_blocks;
+}
+
+static size_t hw_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len,
+                             const AES_KEY *key, uint8_t ivec[16],
+                             uint64_t *Xi) {
+  const size_t len_blocks = len & kSizeTWithoutLower4Bits;
+  if (!len_blocks) {
+    return 0;
+  }
+  aes_gcm_dec_kernel(in, len_blocks * 8, out, Xi, ivec, key);
+  return len_blocks;
+}
+
+#endif  // HW_GCM && AARCH64
+
 void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash,
                        u128 *out_key, u128 out_table[16], int *out_is_avx,
                        const uint8_t gcm_key[16]) {
@@ -231,7 +257,12 @@
   CRYPTO_ghash_init(&gcm_key->gmult, &gcm_key->ghash, &gcm_key->H,
                     gcm_key->Htable, &is_avx, ghash_key);
 
-  gcm_key->use_hw_gcm_crypt = (is_avx && block_is_hwaes) ? 1 : 0;
+#if defined(OPENSSL_AARCH64)
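+  // The AArch64 kernels need both PMULL (for GHASH) and hardware AES.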
+  gcm_key->use_hw_gcm_crypt = (gcm_pmull_capable() && block_is_hwaes) ? 1 : 0;
+#else
+  gcm_key->use_hw_gcm_crypt = (is_avx && block_is_hwaes) ? 1 : 0;
+#endif
 }
 
 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const AES_KEY *key,
diff --git a/crypto/fipsmodule/modes/gcm_test.cc b/crypto/fipsmodule/modes/gcm_test.cc
index d66d8ae..8d15cc6 100644
--- a/crypto/fipsmodule/modes/gcm_test.cc
+++ b/crypto/fipsmodule/modes/gcm_test.cc
@@ -209,6 +209,20 @@
   }
 #endif  // GHASH_ASM_ARM
 
+#if defined(OPENSSL_AARCH64) && defined(HW_GCM)
+  if (hwaes_capable() && gcm_pmull_capable()) {
+    static const uint8_t kKey[16] = {0};
+    uint8_t iv[16] = {0};
+
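+    // Exercise both kernels at each supported key length.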
+    for (size_t key_bits = 128; key_bits <= 256; key_bits += 64) {
+      AES_KEY aes_key;
+      aes_hw_set_encrypt_key(kKey, key_bits, &aes_key);
+      CHECK_ABI(aes_gcm_enc_kernel, buf, sizeof(buf) * 8, buf, X, iv, &aes_key);
+      CHECK_ABI(aes_gcm_dec_kernel, buf, sizeof(buf) * 8, buf, X, iv, &aes_key);
+    }
+  }
+#endif
+
 #if defined(GHASH_ASM_PPC64LE)
   if (CRYPTO_is_PPC64LE_vcrypto_capable()) {
     CHECK_ABI(gcm_init_p8, Htable, kH);
diff --git a/crypto/fipsmodule/modes/internal.h b/crypto/fipsmodule/modes/internal.h
index 3cdface..8a0a75f 100644
--- a/crypto/fipsmodule/modes/internal.h
+++ b/crypto/fipsmodule/modes/internal.h
@@ -279,6 +279,7 @@
 #endif  // OPENSSL_X86
 
 #elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
+
 #define GHASH_ASM_ARM
 #define GCM_FUNCREF
 
@@ -298,6 +299,15 @@
 void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                     size_t len);
 
+#if defined(OPENSSL_AARCH64)
+#define HW_GCM
+// These functions are defined in aesv8-gcm-armv8.pl.
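+// |in_bits| is the length of the input in bits. The counter block |ivec| and
+// the tag state |Xi| are updated in place.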
+void aes_gcm_enc_kernel(const uint8_t *in, uint64_t in_bits, void *out,
+                        void *Xi, uint8_t *ivec, const AES_KEY *key);
+void aes_gcm_dec_kernel(const uint8_t *in, uint64_t in_bits, void *out,
+                        void *Xi, uint8_t *ivec, const AES_KEY *key);
+#endif
+
 #elif defined(OPENSSL_PPC64LE)
 #define GHASH_ASM_PPC64LE
 #define GCM_FUNCREF