|  | // This file is generated from a similarly-named Perl script in the BoringSSL | 
|  | // source tree. Do not edit by hand. | 
|  |  | 
|  | #include <openssl/asm_base.h> | 
|  |  | 
|  | #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) | 
|  | #include <openssl/arm_arch.h> | 
|  | .section	.rodata | 
|  |  | 
|  | .align	7 | 
|  | Lchacha20_consts: | 
|  | .byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' | 
|  | Linc: | 
|  | .long	1,2,3,4 | 
|  | Lrol8: | 
|  | .byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 | 
|  | Lclamp: | 
|  | .quad	0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC | 
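// Linc holds per-block counter increments, Lrol8 is a tbl permutation that
// rotates each 32-bit lane left by 8 bits, and Lclamp is the Poly1305 r-key
// clamp mask.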
|  |  | 
|  | .text | 
|  |  | 
|  | .def Lpoly_hash_ad_internal | 
|  | .type 32 | 
|  | .endef | 
|  | .align	6 | 
|  | Lpoly_hash_ad_internal: | 
|  | .cfi_startproc | 
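// Hashes the AAD: x3 points to the AAD and x4 holds its length in bytes.
// The Poly1305 accumulator lives in x8:x9:x10, the clamped r key in x16:x17,
// and x15 holds the constant 1 used as the 2^128 pad bit of each full block.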
|  | cbnz	x4, Lpoly_hash_intro | 
|  | ret | 
|  |  | 
|  | Lpoly_hash_intro: | 
|  | cmp	x4, #16 | 
|  | b.lt	Lpoly_hash_ad_tail | 
|  | ldp	x11, x12, [x3], 16 | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
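// Multiply the accumulator by r and reduce modulo 2^130 - 5: the bits above
// 2^130 are folded back in multiplied by 5 (added once as-is and once
// multiplied by four).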
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | sub	x4, x4, #16 | 
|  | b	Lpoly_hash_ad_internal | 
|  |  | 
|  | Lpoly_hash_ad_tail: | 
|  | cbz	x4, Lpoly_hash_ad_ret | 
|  |  | 
|  | eor	v20.16b, v20.16b, v20.16b // Use T0 to load the AAD | 
|  | sub	x4, x4, #1 | 
|  |  | 
|  | Lpoly_hash_tail_16_compose: | 
|  | ext	v20.16b, v20.16b, v20.16b, #15 | 
|  | ldrb	w11, [x3, x4] | 
|  | mov	v20.b[0], w11 | 
|  | subs	x4, x4, #1 | 
|  | b.ge	Lpoly_hash_tail_16_compose | 
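// v20 now holds the final partial AAD block, zero-padded up to 16 bytes.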
|  | mov	x11, v20.d[0] | 
|  | mov	x12, v20.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  |  | 
|  | Lpoly_hash_ad_ret: | 
|  | ret | 
|  | .cfi_endproc | 
|  |  | 
|  |  | 
|  | ///////////////////////////////// | 
|  | // | 
|  | // void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); | 
|  | // | 
|  | .globl	chacha20_poly1305_seal | 
|  |  | 
|  | .def chacha20_poly1305_seal | 
|  | .type 32 | 
|  | .endef | 
|  | .align	6 | 
|  | chacha20_poly1305_seal: | 
|  | AARCH64_SIGN_LINK_REGISTER | 
|  | .cfi_startproc | 
|  | stp	x29, x30, [sp, #-80]! | 
|  | .cfi_def_cfa_offset	80 | 
|  | .cfi_offset	w30, -72 | 
|  | .cfi_offset	w29, -80 | 
|  | mov	x29, sp | 
|  | // We probably could do .cfi_def_cfa w29, 80 at this point, but since | 
|  | // we don't actually use the frame pointer like that, it's probably not | 
|  | // worth bothering. | 
|  | stp	d8, d9, [sp, #16] | 
|  | stp	d10, d11, [sp, #32] | 
|  | stp	d12, d13, [sp, #48] | 
|  | stp	d14, d15, [sp, #64] | 
|  | .cfi_offset	b15, -8 | 
|  | .cfi_offset	b14, -16 | 
|  | .cfi_offset	b13, -24 | 
|  | .cfi_offset	b12, -32 | 
|  | .cfi_offset	b11, -40 | 
|  | .cfi_offset	b10, -48 | 
|  | .cfi_offset	b9, -56 | 
|  | .cfi_offset	b8, -64 | 
|  |  | 
|  | adrp	x11, Lchacha20_consts | 
|  | add	x11, x11, :lo12:Lchacha20_consts | 
|  |  | 
|  | ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values | 
|  | ld1	{v28.16b - v30.16b}, [x5] | 
|  |  | 
|  | mov	x15, #1 // Prepare the Poly1305 state | 
|  | mov	x8, #0 | 
|  | mov	x9, #0 | 
|  | mov	x10, #0 | 
|  |  | 
|  | ldr	x12, [x5, #56]   // The total cipher text length includes extra_in_len | 
|  | add	x12, x12, x2 | 
|  | mov	v31.d[0], x4  // Store the input and aad lengths | 
|  | mov	v31.d[1], x12 | 
|  |  | 
|  | cmp	x2, #128 | 
|  | b.le	Lseal_128 // Optimization for smaller buffers | 
|  |  | 
|  | // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, | 
|  | // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, | 
|  | // the fifth block (A4-D4) horizontally. | 
|  | ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11] | 
|  | mov	v4.16b, v24.16b | 
|  |  | 
|  | ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 | 
|  | mov	v9.16b, v28.16b | 
|  |  | 
|  | ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 | 
|  | mov	v14.16b, v29.16b | 
|  |  | 
|  | ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5] | 
|  | add	v15.4s, v15.4s, v25.4s | 
|  | mov	v19.16b, v30.16b | 
|  |  | 
|  | sub	x5, x5, #32 | 
|  |  | 
|  | mov	x6, #10 | 
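// Each iteration below runs one ChaCha20 double round (a column round then a
// diagonal round) on all five blocks; rev32, tbl with Lrol8, and the ushr+sli
// pairs implement the 16-, 8-, 12- and 7-bit rotations.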
|  |  | 
|  | .align	5 | 
|  | Lseal_init_rounds: | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | add	v3.4s, v3.4s, v8.4s | 
|  | add	v4.4s, v4.4s, v9.4s | 
|  |  | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | eor	v18.16b, v18.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  | rev32	v18.8h, v18.8h | 
|  | rev32	v19.8h, v19.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | add	v13.4s, v13.4s, v18.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | eor	v6.16b, v6.16b, v11.16b | 
|  | eor	v7.16b, v7.16b, v12.16b | 
|  | eor	v8.16b, v8.16b, v13.16b | 
|  | eor	v9.16b, v9.16b, v14.16b | 
|  |  | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v6.4s, #20 | 
|  | sli	v5.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  | ushr	v7.4s, v8.4s, #20 | 
|  | sli	v7.4s, v8.4s, #12 | 
|  | ushr	v8.4s, v9.4s, #20 | 
|  | sli	v8.4s, v9.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v5.4s | 
|  | add	v2.4s, v2.4s, v6.4s | 
|  | add	v3.4s, v3.4s, v7.4s | 
|  | add	v4.4s, v4.4s, v8.4s | 
|  |  | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | eor	v18.16b, v18.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  | tbl	v18.16b, {v18.16b}, v26.16b | 
|  | tbl	v19.16b, {v19.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | add	v13.4s, v13.4s, v18.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | eor	v7.16b, v7.16b, v13.16b | 
|  | eor	v8.16b, v8.16b, v14.16b | 
|  |  | 
|  | ushr	v9.4s, v8.4s, #25 | 
|  | sli	v9.4s, v8.4s, #7 | 
|  | ushr	v8.4s, v7.4s, #25 | 
|  | sli	v8.4s, v7.4s, #7 | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v5.4s, #25 | 
|  | sli	v6.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v9.16b, v9.16b, v9.16b, #4 | 
|  | ext	v14.16b, v14.16b, v14.16b, #8 | 
|  | ext	v19.16b, v19.16b, v19.16b, #12 | 
|  | add	v0.4s, v0.4s, v6.4s | 
|  | add	v1.4s, v1.4s, v7.4s | 
|  | add	v2.4s, v2.4s, v8.4s | 
|  | add	v3.4s, v3.4s, v5.4s | 
|  | add	v4.4s, v4.4s, v9.4s | 
|  |  | 
|  | eor	v18.16b, v18.16b, v0.16b | 
|  | eor	v15.16b, v15.16b, v1.16b | 
|  | eor	v16.16b, v16.16b, v2.16b | 
|  | eor	v17.16b, v17.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | rev32	v18.8h, v18.8h | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  | rev32	v19.8h, v19.8h | 
|  |  | 
|  | add	v12.4s, v12.4s, v18.4s | 
|  | add	v13.4s, v13.4s, v15.4s | 
|  | add	v10.4s, v10.4s, v16.4s | 
|  | add	v11.4s, v11.4s, v17.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | eor	v7.16b, v7.16b, v13.16b | 
|  | eor	v8.16b, v8.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v9.16b, v9.16b, v14.16b | 
|  |  | 
|  | ushr	v20.4s, v6.4s, #20 | 
|  | sli	v20.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  | ushr	v7.4s, v8.4s, #20 | 
|  | sli	v7.4s, v8.4s, #12 | 
|  | ushr	v8.4s, v5.4s, #20 | 
|  | sli	v8.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v9.4s, #20 | 
|  | sli	v5.4s, v9.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | add	v3.4s, v3.4s, v8.4s | 
|  | add	v4.4s, v4.4s, v5.4s | 
|  |  | 
|  | eor	v18.16b, v18.16b, v0.16b | 
|  | eor	v15.16b, v15.16b, v1.16b | 
|  | eor	v16.16b, v16.16b, v2.16b | 
|  | eor	v17.16b, v17.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | tbl	v18.16b, {v18.16b}, v26.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  | tbl	v19.16b, {v19.16b}, v26.16b | 
|  |  | 
|  | add	v12.4s, v12.4s, v18.4s | 
|  | add	v13.4s, v13.4s, v15.4s | 
|  | add	v10.4s, v10.4s, v16.4s | 
|  | add	v11.4s, v11.4s, v17.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v20.16b, v20.16b, v12.16b | 
|  | eor	v6.16b, v6.16b, v13.16b | 
|  | eor	v7.16b, v7.16b, v10.16b | 
|  | eor	v8.16b, v8.16b, v11.16b | 
|  | eor	v5.16b, v5.16b, v14.16b | 
|  |  | 
|  | ushr	v9.4s, v5.4s, #25 | 
|  | sli	v9.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v8.4s, #25 | 
|  | sli	v5.4s, v8.4s, #7 | 
|  | ushr	v8.4s, v7.4s, #25 | 
|  | sli	v8.4s, v7.4s, #7 | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v20.4s, #25 | 
|  | sli	v6.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v9.16b, v9.16b, v9.16b, #12 | 
|  | ext	v14.16b, v14.16b, v14.16b, #8 | 
|  | ext	v19.16b, v19.16b, v19.16b, #4 | 
|  | subs	x6, x6, #1 | 
|  | b.hi	Lseal_init_rounds | 
|  |  | 
|  | add	v15.4s, v15.4s, v25.4s | 
|  | mov	x11, #4 | 
|  | dup	v20.4s, w11 | 
|  | add	v25.4s, v25.4s, v20.4s | 
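// Transpose the four vertically-computed blocks back into sequential block
// order: a 32-bit interleave followed by a 64-bit interleave per row.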
|  |  | 
|  | zip1	v20.4s, v0.4s, v1.4s | 
|  | zip2	v21.4s, v0.4s, v1.4s | 
|  | zip1	v22.4s, v2.4s, v3.4s | 
|  | zip2	v23.4s, v2.4s, v3.4s | 
|  |  | 
|  | zip1	v0.2d, v20.2d, v22.2d | 
|  | zip2	v1.2d, v20.2d, v22.2d | 
|  | zip1	v2.2d, v21.2d, v23.2d | 
|  | zip2	v3.2d, v21.2d, v23.2d | 
|  |  | 
|  | zip1	v20.4s, v5.4s, v6.4s | 
|  | zip2	v21.4s, v5.4s, v6.4s | 
|  | zip1	v22.4s, v7.4s, v8.4s | 
|  | zip2	v23.4s, v7.4s, v8.4s | 
|  |  | 
|  | zip1	v5.2d, v20.2d, v22.2d | 
|  | zip2	v6.2d, v20.2d, v22.2d | 
|  | zip1	v7.2d, v21.2d, v23.2d | 
|  | zip2	v8.2d, v21.2d, v23.2d | 
|  |  | 
|  | zip1	v20.4s, v10.4s, v11.4s | 
|  | zip2	v21.4s, v10.4s, v11.4s | 
|  | zip1	v22.4s, v12.4s, v13.4s | 
|  | zip2	v23.4s, v12.4s, v13.4s | 
|  |  | 
|  | zip1	v10.2d, v20.2d, v22.2d | 
|  | zip2	v11.2d, v20.2d, v22.2d | 
|  | zip1	v12.2d, v21.2d, v23.2d | 
|  | zip2	v13.2d, v21.2d, v23.2d | 
|  |  | 
|  | zip1	v20.4s, v15.4s, v16.4s | 
|  | zip2	v21.4s, v15.4s, v16.4s | 
|  | zip1	v22.4s, v17.4s, v18.4s | 
|  | zip2	v23.4s, v17.4s, v18.4s | 
|  |  | 
|  | zip1	v15.2d, v20.2d, v22.2d | 
|  | zip2	v16.2d, v20.2d, v22.2d | 
|  | zip1	v17.2d, v21.2d, v23.2d | 
|  | zip2	v18.2d, v21.2d, v23.2d | 
|  |  | 
|  | add	v4.4s, v4.4s, v24.4s | 
|  | add	v9.4s, v9.4s, v28.4s | 
|  | and	v4.16b, v4.16b, v27.16b | 
|  |  | 
|  | add	v0.4s, v0.4s, v24.4s | 
|  | add	v5.4s, v5.4s, v28.4s | 
|  | add	v10.4s, v10.4s, v29.4s | 
|  | add	v15.4s, v15.4s, v30.4s | 
|  |  | 
|  | add	v1.4s, v1.4s, v24.4s | 
|  | add	v6.4s, v6.4s, v28.4s | 
|  | add	v11.4s, v11.4s, v29.4s | 
|  | add	v16.4s, v16.4s, v30.4s | 
|  |  | 
|  | add	v2.4s, v2.4s, v24.4s | 
|  | add	v7.4s, v7.4s, v28.4s | 
|  | add	v12.4s, v12.4s, v29.4s | 
|  | add	v17.4s, v17.4s, v30.4s | 
|  |  | 
|  | add	v3.4s, v3.4s, v24.4s | 
|  | add	v8.4s, v8.4s, v28.4s | 
|  | add	v13.4s, v13.4s, v29.4s | 
|  | add	v18.4s, v18.4s, v30.4s | 
|  |  | 
|  | mov	x16, v4.d[0] // Move the R key to GPRs | 
|  | mov	x17, v4.d[1] | 
|  | mov	v27.16b, v9.16b // Store the S key | 
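// x16:x17 now hold the clamped Poly1305 r key; v27 (which held the clamp
// mask) is reused to carry the s key until finalization.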
|  |  | 
|  | bl	Lpoly_hash_ad_internal | 
|  |  | 
|  | mov	x3, x0 | 
|  | cmp	x2, #256 | 
|  | b.le	Lseal_tail | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v0.16b | 
|  | eor	v21.16b, v21.16b, v5.16b | 
|  | eor	v22.16b, v22.16b, v10.16b | 
|  | eor	v23.16b, v23.16b, v15.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v1.16b | 
|  | eor	v21.16b, v21.16b, v6.16b | 
|  | eor	v22.16b, v22.16b, v11.16b | 
|  | eor	v23.16b, v23.16b, v16.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v2.16b | 
|  | eor	v21.16b, v21.16b, v7.16b | 
|  | eor	v22.16b, v22.16b, v12.16b | 
|  | eor	v23.16b, v23.16b, v17.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v3.16b | 
|  | eor	v21.16b, v21.16b, v8.16b | 
|  | eor	v22.16b, v22.16b, v13.16b | 
|  | eor	v23.16b, v23.16b, v18.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | sub	x2, x2, #256 | 
|  |  | 
|  | mov	x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds | 
|  | mov	x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 | 
|  |  | 
|  | Lseal_main_loop: | 
|  | adrp	x11, Lchacha20_consts | 
|  | add	x11, x11, :lo12:Lchacha20_consts | 
|  |  | 
|  | ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11] | 
|  | mov	v4.16b, v24.16b | 
|  |  | 
|  | ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 | 
|  | mov	v9.16b, v28.16b | 
|  |  | 
|  | ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 | 
|  | mov	v14.16b, v29.16b | 
|  |  | 
|  | ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5] | 
|  | add	v15.4s, v15.4s, v25.4s | 
|  | mov	v19.16b, v30.16b | 
|  |  | 
|  | eor	v20.16b, v20.16b, v20.16b //zero | 
|  | not	v21.16b, v20.16b // -1 | 
|  | sub	v21.4s, v25.4s, v21.4s // Add +1 | 
|  | ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) | 
|  | add	v19.4s, v19.4s, v20.4s | 
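// The fifth (horizontally-computed) block now uses the counter immediately
// after the four vertically-computed blocks.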
|  |  | 
|  | sub	x5, x5, #32 | 
|  | .align	5 | 
|  | Lseal_main_loop_rounds: | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | add	v3.4s, v3.4s, v8.4s | 
|  | add	v4.4s, v4.4s, v9.4s | 
|  |  | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | eor	v18.16b, v18.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  | rev32	v18.8h, v18.8h | 
|  | rev32	v19.8h, v19.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | add	v13.4s, v13.4s, v18.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | eor	v6.16b, v6.16b, v11.16b | 
|  | eor	v7.16b, v7.16b, v12.16b | 
|  | eor	v8.16b, v8.16b, v13.16b | 
|  | eor	v9.16b, v9.16b, v14.16b | 
|  |  | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v6.4s, #20 | 
|  | sli	v5.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  | ushr	v7.4s, v8.4s, #20 | 
|  | sli	v7.4s, v8.4s, #12 | 
|  | ushr	v8.4s, v9.4s, #20 | 
|  | sli	v8.4s, v9.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v5.4s | 
|  | add	v2.4s, v2.4s, v6.4s | 
|  | add	v3.4s, v3.4s, v7.4s | 
|  | add	v4.4s, v4.4s, v8.4s | 
|  |  | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | eor	v18.16b, v18.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  | tbl	v18.16b, {v18.16b}, v26.16b | 
|  | tbl	v19.16b, {v19.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | add	v13.4s, v13.4s, v18.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | eor	v7.16b, v7.16b, v13.16b | 
|  | eor	v8.16b, v8.16b, v14.16b | 
|  |  | 
|  | ushr	v9.4s, v8.4s, #25 | 
|  | sli	v9.4s, v8.4s, #7 | 
|  | ushr	v8.4s, v7.4s, #25 | 
|  | sli	v8.4s, v7.4s, #7 | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v5.4s, #25 | 
|  | sli	v6.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v9.16b, v9.16b, v9.16b, #4 | 
|  | ext	v14.16b, v14.16b, v14.16b, #8 | 
|  | ext	v19.16b, v19.16b, v19.16b, #12 | 
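// From here on, a 16-byte Poly1305 absorb-and-reduce of previously written
// ciphertext is interleaved with the ChaCha20 rounds so both dependency
// chains can progress in parallel.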
|  | ldp	x11, x12, [x3], 16 | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | add	v0.4s, v0.4s, v6.4s | 
|  | add	v1.4s, v1.4s, v7.4s | 
|  | add	v2.4s, v2.4s, v8.4s | 
|  | add	v3.4s, v3.4s, v5.4s | 
|  | add	v4.4s, v4.4s, v9.4s | 
|  |  | 
|  | eor	v18.16b, v18.16b, v0.16b | 
|  | eor	v15.16b, v15.16b, v1.16b | 
|  | eor	v16.16b, v16.16b, v2.16b | 
|  | eor	v17.16b, v17.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | rev32	v18.8h, v18.8h | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  | rev32	v19.8h, v19.8h | 
|  |  | 
|  | add	v12.4s, v12.4s, v18.4s | 
|  | add	v13.4s, v13.4s, v15.4s | 
|  | add	v10.4s, v10.4s, v16.4s | 
|  | add	v11.4s, v11.4s, v17.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | eor	v7.16b, v7.16b, v13.16b | 
|  | eor	v8.16b, v8.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v9.16b, v9.16b, v14.16b | 
|  |  | 
|  | ushr	v20.4s, v6.4s, #20 | 
|  | sli	v20.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  | ushr	v7.4s, v8.4s, #20 | 
|  | sli	v7.4s, v8.4s, #12 | 
|  | ushr	v8.4s, v5.4s, #20 | 
|  | sli	v8.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v9.4s, #20 | 
|  | sli	v5.4s, v9.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | add	v3.4s, v3.4s, v8.4s | 
|  | add	v4.4s, v4.4s, v5.4s | 
|  |  | 
|  | eor	v18.16b, v18.16b, v0.16b | 
|  | eor	v15.16b, v15.16b, v1.16b | 
|  | eor	v16.16b, v16.16b, v2.16b | 
|  | eor	v17.16b, v17.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | tbl	v18.16b, {v18.16b}, v26.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  | tbl	v19.16b, {v19.16b}, v26.16b | 
|  |  | 
|  | add	v12.4s, v12.4s, v18.4s | 
|  | add	v13.4s, v13.4s, v15.4s | 
|  | add	v10.4s, v10.4s, v16.4s | 
|  | add	v11.4s, v11.4s, v17.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v20.16b, v20.16b, v12.16b | 
|  | eor	v6.16b, v6.16b, v13.16b | 
|  | eor	v7.16b, v7.16b, v10.16b | 
|  | eor	v8.16b, v8.16b, v11.16b | 
|  | eor	v5.16b, v5.16b, v14.16b | 
|  |  | 
|  | ushr	v9.4s, v5.4s, #25 | 
|  | sli	v9.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v8.4s, #25 | 
|  | sli	v5.4s, v8.4s, #7 | 
|  | ushr	v8.4s, v7.4s, #25 | 
|  | sli	v8.4s, v7.4s, #7 | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v20.4s, #25 | 
|  | sli	v6.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v9.16b, v9.16b, v9.16b, #12 | 
|  | ext	v14.16b, v14.16b, v14.16b, #8 | 
|  | ext	v19.16b, v19.16b, v19.16b, #4 | 
|  | subs	x6, x6, #1 | 
|  | b.ge	Lseal_main_loop_rounds | 
|  | ldp	x11, x12, [x3], 16 | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | subs	x7, x7, #1 | 
|  | b.gt	Lseal_main_loop_rounds | 
|  |  | 
|  | eor	v20.16b, v20.16b, v20.16b //zero | 
|  | not	v21.16b, v20.16b // -1 | 
|  | sub	v21.4s, v25.4s, v21.4s // Add +1 | 
|  | ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) | 
|  | add	v19.4s, v19.4s, v20.4s | 
|  |  | 
|  | add	v15.4s, v15.4s, v25.4s | 
|  | mov	x11, #5 | 
|  | dup	v20.4s, w11 | 
|  | add	v25.4s, v25.4s, v20.4s | 
|  |  | 
|  | zip1	v20.4s, v0.4s, v1.4s | 
|  | zip2	v21.4s, v0.4s, v1.4s | 
|  | zip1	v22.4s, v2.4s, v3.4s | 
|  | zip2	v23.4s, v2.4s, v3.4s | 
|  |  | 
|  | zip1	v0.2d, v20.2d, v22.2d | 
|  | zip2	v1.2d, v20.2d, v22.2d | 
|  | zip1	v2.2d, v21.2d, v23.2d | 
|  | zip2	v3.2d, v21.2d, v23.2d | 
|  |  | 
|  | zip1	v20.4s, v5.4s, v6.4s | 
|  | zip2	v21.4s, v5.4s, v6.4s | 
|  | zip1	v22.4s, v7.4s, v8.4s | 
|  | zip2	v23.4s, v7.4s, v8.4s | 
|  |  | 
|  | zip1	v5.2d, v20.2d, v22.2d | 
|  | zip2	v6.2d, v20.2d, v22.2d | 
|  | zip1	v7.2d, v21.2d, v23.2d | 
|  | zip2	v8.2d, v21.2d, v23.2d | 
|  |  | 
|  | zip1	v20.4s, v10.4s, v11.4s | 
|  | zip2	v21.4s, v10.4s, v11.4s | 
|  | zip1	v22.4s, v12.4s, v13.4s | 
|  | zip2	v23.4s, v12.4s, v13.4s | 
|  |  | 
|  | zip1	v10.2d, v20.2d, v22.2d | 
|  | zip2	v11.2d, v20.2d, v22.2d | 
|  | zip1	v12.2d, v21.2d, v23.2d | 
|  | zip2	v13.2d, v21.2d, v23.2d | 
|  |  | 
|  | zip1	v20.4s, v15.4s, v16.4s | 
|  | zip2	v21.4s, v15.4s, v16.4s | 
|  | zip1	v22.4s, v17.4s, v18.4s | 
|  | zip2	v23.4s, v17.4s, v18.4s | 
|  |  | 
|  | zip1	v15.2d, v20.2d, v22.2d | 
|  | zip2	v16.2d, v20.2d, v22.2d | 
|  | zip1	v17.2d, v21.2d, v23.2d | 
|  | zip2	v18.2d, v21.2d, v23.2d | 
|  |  | 
|  | add	v0.4s, v0.4s, v24.4s | 
|  | add	v5.4s, v5.4s, v28.4s | 
|  | add	v10.4s, v10.4s, v29.4s | 
|  | add	v15.4s, v15.4s, v30.4s | 
|  |  | 
|  | add	v1.4s, v1.4s, v24.4s | 
|  | add	v6.4s, v6.4s, v28.4s | 
|  | add	v11.4s, v11.4s, v29.4s | 
|  | add	v16.4s, v16.4s, v30.4s | 
|  |  | 
|  | add	v2.4s, v2.4s, v24.4s | 
|  | add	v7.4s, v7.4s, v28.4s | 
|  | add	v12.4s, v12.4s, v29.4s | 
|  | add	v17.4s, v17.4s, v30.4s | 
|  |  | 
|  | add	v3.4s, v3.4s, v24.4s | 
|  | add	v8.4s, v8.4s, v28.4s | 
|  | add	v13.4s, v13.4s, v29.4s | 
|  | add	v18.4s, v18.4s, v30.4s | 
|  |  | 
|  | add	v4.4s, v4.4s, v24.4s | 
|  | add	v9.4s, v9.4s, v28.4s | 
|  | add	v14.4s, v14.4s, v29.4s | 
|  | add	v19.4s, v19.4s, v30.4s | 
|  |  | 
|  | cmp	x2, #320 | 
|  | b.le	Lseal_tail | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v0.16b | 
|  | eor	v21.16b, v21.16b, v5.16b | 
|  | eor	v22.16b, v22.16b, v10.16b | 
|  | eor	v23.16b, v23.16b, v15.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v1.16b | 
|  | eor	v21.16b, v21.16b, v6.16b | 
|  | eor	v22.16b, v22.16b, v11.16b | 
|  | eor	v23.16b, v23.16b, v16.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v2.16b | 
|  | eor	v21.16b, v21.16b, v7.16b | 
|  | eor	v22.16b, v22.16b, v12.16b | 
|  | eor	v23.16b, v23.16b, v17.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v3.16b | 
|  | eor	v21.16b, v21.16b, v8.16b | 
|  | eor	v22.16b, v22.16b, v13.16b | 
|  | eor	v23.16b, v23.16b, v18.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v4.16b | 
|  | eor	v21.16b, v21.16b, v9.16b | 
|  | eor	v22.16b, v22.16b, v14.16b | 
|  | eor	v23.16b, v23.16b, v19.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | sub	x2, x2, #320 | 
|  |  | 
|  | mov	x6, #0 | 
|  | mov	x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration | 
|  |  | 
|  | b	Lseal_main_loop | 
|  |  | 
|  | Lseal_tail: | 
|  | // This part of the function handles the storage and authentication of the last [0,320) bytes | 
|  | // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. | 
|  | cmp	x2, #64 | 
|  | b.lt	Lseal_tail_64 | 
|  |  | 
|  | // Store and authenticate 64B blocks per iteration | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  |  | 
|  | eor	v20.16b, v20.16b, v0.16b | 
|  | eor	v21.16b, v21.16b, v5.16b | 
|  | eor	v22.16b, v22.16b, v10.16b | 
|  | eor	v23.16b, v23.16b, v15.16b | 
|  | mov	x11, v20.d[0] | 
|  | mov	x12, v20.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | mov	x11, v21.d[0] | 
|  | mov	x12, v21.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | mov	x11, v22.d[0] | 
|  | mov	x12, v22.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | mov	x11, v23.d[0] | 
|  | mov	x12, v23.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  | sub	x2, x2, #64 | 
|  |  | 
|  | // Shift the state left by 64 bytes for the next iteration of the loop | 
|  | mov	v0.16b, v1.16b | 
|  | mov	v5.16b, v6.16b | 
|  | mov	v10.16b, v11.16b | 
|  | mov	v15.16b, v16.16b | 
|  |  | 
|  | mov	v1.16b, v2.16b | 
|  | mov	v6.16b, v7.16b | 
|  | mov	v11.16b, v12.16b | 
|  | mov	v16.16b, v17.16b | 
|  |  | 
|  | mov	v2.16b, v3.16b | 
|  | mov	v7.16b, v8.16b | 
|  | mov	v12.16b, v13.16b | 
|  | mov	v17.16b, v18.16b | 
|  |  | 
|  | mov	v3.16b, v4.16b | 
|  | mov	v8.16b, v9.16b | 
|  | mov	v13.16b, v14.16b | 
|  | mov	v18.16b, v19.16b | 
|  |  | 
|  | b	Lseal_tail | 
|  |  | 
|  | Lseal_tail_64: | 
|  | ldp	x3, x4, [x5, #48] // extra_in_len and extra_in_ptr | 
|  |  | 
|  | // Here we handle the last [0,64) bytes of plaintext | 
|  | cmp	x2, #16 | 
|  | b.lt	Lseal_tail_16 | 
// Each iteration encrypts and authenticates a 16B block
|  | ld1	{v20.16b}, [x1], #16 | 
|  | eor	v20.16b, v20.16b, v0.16b | 
|  | mov	x11, v20.d[0] | 
|  | mov	x12, v20.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | st1	{v20.16b}, [x0], #16 | 
|  |  | 
|  | sub	x2, x2, #16 | 
|  |  | 
|  | // Shift the state left by 16 bytes for the next iteration of the loop | 
|  | mov	v0.16b, v5.16b | 
|  | mov	v5.16b, v10.16b | 
|  | mov	v10.16b, v15.16b | 
|  |  | 
|  | b	Lseal_tail_64 | 
|  |  | 
|  | Lseal_tail_16: | 
|  | // Here we handle the last [0,16) bytes of ciphertext that require a padded block | 
|  | cbz	x2, Lseal_hash_extra | 
|  |  | 
|  | eor	v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in | 
|  | eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes | 
|  | not	v22.16b, v20.16b | 
|  |  | 
|  | mov	x6, x2 | 
|  | add	x1, x1, x2 | 
|  |  | 
|  | cbz	x4, Lseal_tail_16_compose // No extra data to pad with, zero padding | 
|  |  | 
|  | mov	x7, #16          // We need to load some extra_in first for padding | 
|  | sub	x7, x7, x2 | 
|  | cmp	x4, x7 | 
|  | csel	x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register | 
|  | mov	x12, x7 | 
|  | add	x3, x3, x7 | 
|  | sub	x4, x4, x7 | 
|  |  | 
|  | Lseal_tail16_compose_extra_in: | 
|  | ext	v20.16b, v20.16b, v20.16b, #15 | 
|  | ldrb	w11, [x3, #-1]! | 
|  | mov	v20.b[0], w11 | 
|  | subs	x7, x7, #1 | 
|  | b.gt	Lseal_tail16_compose_extra_in | 
|  |  | 
|  | add	x3, x3, x12 | 
|  |  | 
|  | Lseal_tail_16_compose: | 
|  | ext	v20.16b, v20.16b, v20.16b, #15 | 
|  | ldrb	w11, [x1, #-1]! | 
|  | mov	v20.b[0], w11 | 
|  | ext	v21.16b, v22.16b, v21.16b, #15 | 
|  | subs	x2, x2, #1 | 
|  | b.gt	Lseal_tail_16_compose | 
|  |  | 
|  | and	v0.16b, v0.16b, v21.16b | 
|  | eor	v20.16b, v20.16b, v0.16b | 
|  | mov	v21.16b, v20.16b | 
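// v20 now holds the ciphertext bytes in its low lanes with the extra_in bytes
// untouched above them; keep a copy in v21 so the combined block can be
// hashed after the ciphertext bytes are stored.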
|  |  | 
|  | Lseal_tail_16_store: | 
|  | umov	w11, v20.b[0] | 
|  | strb	w11, [x0], #1 | 
|  | ext	v20.16b, v20.16b, v20.16b, #1 | 
|  | subs	x6, x6, #1 | 
|  | b.gt	Lseal_tail_16_store | 
|  |  | 
|  | // Hash in the final ct block concatenated with extra_in | 
|  | mov	x11, v21.d[0] | 
|  | mov	x12, v21.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  |  | 
|  | Lseal_hash_extra: | 
|  | cbz	x4, Lseal_finalize | 
|  |  | 
|  | Lseal_hash_extra_loop: | 
|  | cmp	x4, #16 | 
|  | b.lt	Lseal_hash_extra_tail | 
|  | ld1	{v20.16b}, [x3], #16 | 
|  | mov	x11, v20.d[0] | 
|  | mov	x12, v20.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | sub	x4, x4, #16 | 
|  | b	Lseal_hash_extra_loop | 
|  |  | 
|  | Lseal_hash_extra_tail: | 
|  | cbz	x4, Lseal_finalize | 
|  | eor	v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext | 
|  | add	x3, x3, x4 | 
|  |  | 
|  | Lseal_hash_extra_load: | 
|  | ext	v20.16b, v20.16b, v20.16b, #15 | 
|  | ldrb	w11, [x3, #-1]! | 
|  | mov	v20.b[0], w11 | 
|  | subs	x4, x4, #1 | 
|  | b.gt	Lseal_hash_extra_load | 
|  |  | 
// Hash in the final padded extra_in block
|  | mov	x11, v20.d[0] | 
|  | mov	x12, v20.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  |  | 
|  | Lseal_finalize: | 
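// Absorb the final length block (AAD length || total ciphertext length) that
// was saved in v31.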
|  | mov	x11, v31.d[0] | 
|  | mov	x12, v31.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | // Final reduction step | 
|  | sub	x12, xzr, x15 | 
|  | orr	x13, xzr, #3 | 
|  | subs	x11, x8, #-5 | 
|  | sbcs	x12, x9, x12 | 
|  | sbcs	x13, x10, x13 | 
|  | csel	x8, x11, x8, cs | 
|  | csel	x9, x12, x9, cs | 
|  | csel	x10, x13, x10, cs | 
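// Carry set means acc >= 2^130 - 5, so the reduced value is selected; the s
// key (kept in v27) is then added to form the tag.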
|  | mov	x11, v27.d[0] | 
|  | mov	x12, v27.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  |  | 
|  | stp	x8, x9, [x5] | 
|  |  | 
|  | ldp	d8, d9, [sp, #16] | 
|  | ldp	d10, d11, [sp, #32] | 
|  | ldp	d12, d13, [sp, #48] | 
|  | ldp	d14, d15, [sp, #64] | 
|  | .cfi_restore	b15 | 
|  | .cfi_restore	b14 | 
|  | .cfi_restore	b13 | 
|  | .cfi_restore	b12 | 
|  | .cfi_restore	b11 | 
|  | .cfi_restore	b10 | 
|  | .cfi_restore	b9 | 
|  | .cfi_restore	b8 | 
|  | ldp	x29, x30, [sp], 80 | 
|  | .cfi_restore	w29 | 
|  | .cfi_restore	w30 | 
|  | .cfi_def_cfa_offset	0 | 
|  | AARCH64_VALIDATE_LINK_REGISTER | 
|  | ret | 
|  |  | 
|  | Lseal_128: | 
|  | // On some architectures preparing 5 blocks for small buffers is wasteful | 
|  | eor	v25.16b, v25.16b, v25.16b | 
|  | mov	x11, #1 | 
|  | mov	v25.s[0], w11 | 
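// Prepare three blocks: v0/v1 (counters 1 and 2) provide keystream for up to
// 128 bytes, while v2 (counter 0) supplies the Poly1305 r and s keys.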
|  | mov	v0.16b, v24.16b | 
|  | mov	v1.16b, v24.16b | 
|  | mov	v2.16b, v24.16b | 
|  | mov	v5.16b, v28.16b | 
|  | mov	v6.16b, v28.16b | 
|  | mov	v7.16b, v28.16b | 
|  | mov	v10.16b, v29.16b | 
|  | mov	v11.16b, v29.16b | 
|  | mov	v12.16b, v29.16b | 
|  | mov	v17.16b, v30.16b | 
|  | add	v15.4s, v17.4s, v25.4s | 
|  | add	v16.4s, v15.4s, v25.4s | 
|  |  | 
|  | mov	x6, #10 | 
|  |  | 
|  | Lseal_128_rounds: | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | eor	v6.16b, v6.16b, v11.16b | 
|  | eor	v7.16b, v7.16b, v12.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v6.4s, #20 | 
|  | sli	v5.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v5.4s | 
|  | add	v2.4s, v2.4s, v6.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v5.4s, #25 | 
|  | sli	v6.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v5.16b, v5.16b, v5.16b, #4 | 
|  | ext	v6.16b, v6.16b, v6.16b, #4 | 
|  | ext	v7.16b, v7.16b, v7.16b, #4 | 
|  |  | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v11.16b, v11.16b, v11.16b, #8 | 
|  | ext	v12.16b, v12.16b, v12.16b, #8 | 
|  |  | 
|  | ext	v15.16b, v15.16b, v15.16b, #12 | 
|  | ext	v16.16b, v16.16b, v16.16b, #12 | 
|  | ext	v17.16b, v17.16b, v17.16b, #12 | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | eor	v6.16b, v6.16b, v11.16b | 
|  | eor	v7.16b, v7.16b, v12.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v6.4s, #20 | 
|  | sli	v5.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v5.4s | 
|  | add	v2.4s, v2.4s, v6.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v5.4s, #25 | 
|  | sli	v6.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v5.16b, v5.16b, v5.16b, #12 | 
|  | ext	v6.16b, v6.16b, v6.16b, #12 | 
|  | ext	v7.16b, v7.16b, v7.16b, #12 | 
|  |  | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v11.16b, v11.16b, v11.16b, #8 | 
|  | ext	v12.16b, v12.16b, v12.16b, #8 | 
|  |  | 
|  | ext	v15.16b, v15.16b, v15.16b, #4 | 
|  | ext	v16.16b, v16.16b, v16.16b, #4 | 
|  | ext	v17.16b, v17.16b, v17.16b, #4 | 
|  | subs	x6, x6, #1 | 
|  | b.hi	Lseal_128_rounds | 
|  |  | 
|  | add	v0.4s, v0.4s, v24.4s | 
|  | add	v1.4s, v1.4s, v24.4s | 
|  | add	v2.4s, v2.4s, v24.4s | 
|  |  | 
|  | add	v5.4s, v5.4s, v28.4s | 
|  | add	v6.4s, v6.4s, v28.4s | 
|  | add	v7.4s, v7.4s, v28.4s | 
|  |  | 
|  | // Only the first 32 bytes of the third block (counter = 0) are needed, | 
|  | // so skip updating v12 and v17. | 
|  | add	v10.4s, v10.4s, v29.4s | 
|  | add	v11.4s, v11.4s, v29.4s | 
|  |  | 
|  | add	v30.4s, v30.4s, v25.4s | 
|  | add	v15.4s, v15.4s, v30.4s | 
|  | add	v30.4s, v30.4s, v25.4s | 
|  | add	v16.4s, v16.4s, v30.4s | 
|  |  | 
|  | and	v2.16b, v2.16b, v27.16b | 
|  | mov	x16, v2.d[0] // Move the R key to GPRs | 
|  | mov	x17, v2.d[1] | 
|  | mov	v27.16b, v7.16b // Store the S key | 
|  |  | 
|  | bl	Lpoly_hash_ad_internal | 
|  | b	Lseal_tail | 
|  | .cfi_endproc | 
|  |  | 
|  |  | 
|  | ///////////////////////////////// | 
|  | // | 
|  | // void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); | 
|  | // | 
|  | .globl	chacha20_poly1305_open | 
|  |  | 
|  | .def chacha20_poly1305_open | 
|  | .type 32 | 
|  | .endef | 
|  | .align	6 | 
|  | chacha20_poly1305_open: | 
|  | AARCH64_SIGN_LINK_REGISTER | 
|  | .cfi_startproc | 
|  | stp	x29, x30, [sp, #-80]! | 
|  | .cfi_def_cfa_offset	80 | 
|  | .cfi_offset	w30, -72 | 
|  | .cfi_offset	w29, -80 | 
|  | mov	x29, sp | 
|  | // We probably could do .cfi_def_cfa w29, 80 at this point, but since | 
|  | // we don't actually use the frame pointer like that, it's probably not | 
|  | // worth bothering. | 
|  | stp	d8, d9, [sp, #16] | 
|  | stp	d10, d11, [sp, #32] | 
|  | stp	d12, d13, [sp, #48] | 
|  | stp	d14, d15, [sp, #64] | 
|  | .cfi_offset	b15, -8 | 
|  | .cfi_offset	b14, -16 | 
|  | .cfi_offset	b13, -24 | 
|  | .cfi_offset	b12, -32 | 
|  | .cfi_offset	b11, -40 | 
|  | .cfi_offset	b10, -48 | 
|  | .cfi_offset	b9, -56 | 
|  | .cfi_offset	b8, -64 | 
|  |  | 
|  | adrp	x11, Lchacha20_consts | 
|  | add	x11, x11, :lo12:Lchacha20_consts | 
|  |  | 
|  | ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values | 
|  | ld1	{v28.16b - v30.16b}, [x5] | 
|  |  | 
|  | mov	x15, #1 // Prepare the Poly1305 state | 
|  | mov	x8, #0 | 
|  | mov	x9, #0 | 
|  | mov	x10, #0 | 
|  |  | 
|  | mov	v31.d[0], x4  // Store the input and aad lengths | 
|  | mov	v31.d[1], x2 | 
|  |  | 
|  | cmp	x2, #128 | 
|  | b.le	Lopen_128 // Optimization for smaller buffers | 
|  |  | 
|  | // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys | 
|  | mov	v0.16b, v24.16b | 
|  | mov	v5.16b, v28.16b | 
|  | mov	v10.16b, v29.16b | 
|  | mov	v15.16b, v30.16b | 
|  |  | 
|  | mov	x6, #10 | 
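// Each iteration below is one ChaCha20 double round on the single block used
// to derive the Poly1305 keys.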
|  |  | 
|  | .align	5 | 
|  | Lopen_init_rounds: | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | rev32	v15.8h, v15.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  | ext	v5.16b, v5.16b, v5.16b, #4 | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v15.16b, v15.16b, v15.16b, #12 | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | rev32	v15.8h, v15.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  | ext	v5.16b, v5.16b, v5.16b, #12 | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v15.16b, v15.16b, v15.16b, #4 | 
|  | subs	x6, x6, #1 | 
|  | b.hi	Lopen_init_rounds | 
|  |  | 
|  | add	v0.4s, v0.4s, v24.4s | 
|  | add	v5.4s, v5.4s, v28.4s | 
|  |  | 
|  | and	v0.16b, v0.16b, v27.16b | 
|  | mov	x16, v0.d[0] // Move the R key to GPRs | 
|  | mov	x17, v0.d[1] | 
|  | mov	v27.16b, v5.16b // Store the S key | 
|  |  | 
|  | bl	Lpoly_hash_ad_internal | 
|  |  | 
|  | Lopen_ad_done: | 
|  | mov	x3, x1 | 
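// x3 tracks the ciphertext being hashed; decrypted output is still written
// through x0.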
|  |  | 
// Each iteration of the loop hashes 320 bytes and prepares keystream for 320 bytes
|  | Lopen_main_loop: | 
|  |  | 
|  | cmp	x2, #192 | 
|  | b.lt	Lopen_tail | 
|  |  | 
|  | adrp	x11, Lchacha20_consts | 
|  | add	x11, x11, :lo12:Lchacha20_consts | 
|  |  | 
|  | ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11] | 
|  | mov	v4.16b, v24.16b | 
|  |  | 
|  | ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 | 
|  | mov	v9.16b, v28.16b | 
|  |  | 
|  | ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 | 
|  | mov	v14.16b, v29.16b | 
|  |  | 
|  | ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5] | 
|  | sub	x5, x5, #32 | 
|  | add	v15.4s, v15.4s, v25.4s | 
|  | mov	v19.16b, v30.16b | 
|  |  | 
|  | eor	v20.16b, v20.16b, v20.16b //zero | 
|  | not	v21.16b, v20.16b // -1 | 
|  | sub	v21.4s, v25.4s, v21.4s // Add +1 | 
|  | ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) | 
|  | add	v19.4s, v19.4s, v20.4s | 
|  |  | 
|  | lsr	x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12 | 
|  | sub	x4, x4, #10 | 
|  |  | 
|  | mov	x7, #10 | 
subs	x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash
|  | csel	x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full | 
|  |  | 
|  | cbz	x7, Lopen_main_loop_rounds_short | 
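// x7 counts double rounds that absorb two 16-byte ciphertext blocks (one at
// the top of the loop, one mid-round); x6 counts the remaining rounds that
// absorb only the mid-round block.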
|  |  | 
|  | .align	5 | 
|  | Lopen_main_loop_rounds: | 
|  | ldp	x11, x12, [x3], 16 | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | Lopen_main_loop_rounds_short: | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | add	v3.4s, v3.4s, v8.4s | 
|  | add	v4.4s, v4.4s, v9.4s | 
|  |  | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | eor	v18.16b, v18.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  | rev32	v18.8h, v18.8h | 
|  | rev32	v19.8h, v19.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | add	v13.4s, v13.4s, v18.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | eor	v6.16b, v6.16b, v11.16b | 
|  | eor	v7.16b, v7.16b, v12.16b | 
|  | eor	v8.16b, v8.16b, v13.16b | 
|  | eor	v9.16b, v9.16b, v14.16b | 
|  |  | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v6.4s, #20 | 
|  | sli	v5.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  | ushr	v7.4s, v8.4s, #20 | 
|  | sli	v7.4s, v8.4s, #12 | 
|  | ushr	v8.4s, v9.4s, #20 | 
|  | sli	v8.4s, v9.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v5.4s | 
|  | add	v2.4s, v2.4s, v6.4s | 
|  | add	v3.4s, v3.4s, v7.4s | 
|  | add	v4.4s, v4.4s, v8.4s | 
|  |  | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | eor	v18.16b, v18.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  | tbl	v18.16b, {v18.16b}, v26.16b | 
|  | tbl	v19.16b, {v19.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | add	v13.4s, v13.4s, v18.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | eor	v7.16b, v7.16b, v13.16b | 
|  | eor	v8.16b, v8.16b, v14.16b | 
|  |  | 
|  | ushr	v9.4s, v8.4s, #25 | 
|  | sli	v9.4s, v8.4s, #7 | 
|  | ushr	v8.4s, v7.4s, #25 | 
|  | sli	v8.4s, v7.4s, #7 | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v5.4s, #25 | 
|  | sli	v6.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v9.16b, v9.16b, v9.16b, #4 | 
|  | ext	v14.16b, v14.16b, v14.16b, #8 | 
|  | ext	v19.16b, v19.16b, v19.16b, #12 | 
|  | ldp	x11, x12, [x3], 16 | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | add	v0.4s, v0.4s, v6.4s | 
|  | add	v1.4s, v1.4s, v7.4s | 
|  | add	v2.4s, v2.4s, v8.4s | 
|  | add	v3.4s, v3.4s, v5.4s | 
|  | add	v4.4s, v4.4s, v9.4s | 
|  |  | 
|  | eor	v18.16b, v18.16b, v0.16b | 
|  | eor	v15.16b, v15.16b, v1.16b | 
|  | eor	v16.16b, v16.16b, v2.16b | 
|  | eor	v17.16b, v17.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | rev32	v18.8h, v18.8h | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  | rev32	v19.8h, v19.8h | 
|  |  | 
|  | add	v12.4s, v12.4s, v18.4s | 
|  | add	v13.4s, v13.4s, v15.4s | 
|  | add	v10.4s, v10.4s, v16.4s | 
|  | add	v11.4s, v11.4s, v17.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | eor	v7.16b, v7.16b, v13.16b | 
|  | eor	v8.16b, v8.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v9.16b, v9.16b, v14.16b | 
|  |  | 
|  | ushr	v20.4s, v6.4s, #20 | 
|  | sli	v20.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  | ushr	v7.4s, v8.4s, #20 | 
|  | sli	v7.4s, v8.4s, #12 | 
|  | ushr	v8.4s, v5.4s, #20 | 
|  | sli	v8.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v9.4s, #20 | 
|  | sli	v5.4s, v9.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | add	v3.4s, v3.4s, v8.4s | 
|  | add	v4.4s, v4.4s, v5.4s | 
|  |  | 
|  | eor	v18.16b, v18.16b, v0.16b | 
|  | eor	v15.16b, v15.16b, v1.16b | 
|  | eor	v16.16b, v16.16b, v2.16b | 
|  | eor	v17.16b, v17.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | tbl	v18.16b, {v18.16b}, v26.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  | tbl	v19.16b, {v19.16b}, v26.16b | 
|  |  | 
|  | add	v12.4s, v12.4s, v18.4s | 
|  | add	v13.4s, v13.4s, v15.4s | 
|  | add	v10.4s, v10.4s, v16.4s | 
|  | add	v11.4s, v11.4s, v17.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v20.16b, v20.16b, v12.16b | 
|  | eor	v6.16b, v6.16b, v13.16b | 
|  | eor	v7.16b, v7.16b, v10.16b | 
|  | eor	v8.16b, v8.16b, v11.16b | 
|  | eor	v5.16b, v5.16b, v14.16b | 
|  |  | 
|  | ushr	v9.4s, v5.4s, #25 | 
|  | sli	v9.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v8.4s, #25 | 
|  | sli	v5.4s, v8.4s, #7 | 
|  | ushr	v8.4s, v7.4s, #25 | 
|  | sli	v8.4s, v7.4s, #7 | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v20.4s, #25 | 
|  | sli	v6.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v9.16b, v9.16b, v9.16b, #12 | 
|  | ext	v14.16b, v14.16b, v14.16b, #8 | 
|  | ext	v19.16b, v19.16b, v19.16b, #4 | 
|  | subs	x7, x7, #1 | 
|  | b.gt	Lopen_main_loop_rounds | 
|  | subs	x6, x6, #1 | 
|  | b.ge	Lopen_main_loop_rounds_short | 
|  |  | 
eor	v20.16b, v20.16b, v20.16b // zero
not	v21.16b, v20.16b // all lanes = -1
sub	v21.4s, v25.4s, v21.4s // subtracting -1 adds +1 to every lane of v25
ext	v20.16b, v21.16b, v20.16b, #12 // keep only the last lane (the counter) in lane 0
|  | add	v19.4s, v19.4s, v20.4s | 
|  |  | 
|  | add	v15.4s, v15.4s, v25.4s | 
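// Advance the block counters in v25 by 5 (five blocks are consumed per main-loop pass).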
|  | mov	x11, #5 | 
|  | dup	v20.4s, w11 | 
|  | add	v25.4s, v25.4s, v20.4s | 
|  |  | 
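// Transpose the word-sliced states of blocks 0-3 back into one row per register
// before adding the initial state rows (v24, v28, v29, v30).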
|  | zip1	v20.4s, v0.4s, v1.4s | 
|  | zip2	v21.4s, v0.4s, v1.4s | 
|  | zip1	v22.4s, v2.4s, v3.4s | 
|  | zip2	v23.4s, v2.4s, v3.4s | 
|  |  | 
|  | zip1	v0.2d, v20.2d, v22.2d | 
|  | zip2	v1.2d, v20.2d, v22.2d | 
|  | zip1	v2.2d, v21.2d, v23.2d | 
|  | zip2	v3.2d, v21.2d, v23.2d | 
|  |  | 
|  | zip1	v20.4s, v5.4s, v6.4s | 
|  | zip2	v21.4s, v5.4s, v6.4s | 
|  | zip1	v22.4s, v7.4s, v8.4s | 
|  | zip2	v23.4s, v7.4s, v8.4s | 
|  |  | 
|  | zip1	v5.2d, v20.2d, v22.2d | 
|  | zip2	v6.2d, v20.2d, v22.2d | 
|  | zip1	v7.2d, v21.2d, v23.2d | 
|  | zip2	v8.2d, v21.2d, v23.2d | 
|  |  | 
|  | zip1	v20.4s, v10.4s, v11.4s | 
|  | zip2	v21.4s, v10.4s, v11.4s | 
|  | zip1	v22.4s, v12.4s, v13.4s | 
|  | zip2	v23.4s, v12.4s, v13.4s | 
|  |  | 
|  | zip1	v10.2d, v20.2d, v22.2d | 
|  | zip2	v11.2d, v20.2d, v22.2d | 
|  | zip1	v12.2d, v21.2d, v23.2d | 
|  | zip2	v13.2d, v21.2d, v23.2d | 
|  |  | 
|  | zip1	v20.4s, v15.4s, v16.4s | 
|  | zip2	v21.4s, v15.4s, v16.4s | 
|  | zip1	v22.4s, v17.4s, v18.4s | 
|  | zip2	v23.4s, v17.4s, v18.4s | 
|  |  | 
|  | zip1	v15.2d, v20.2d, v22.2d | 
|  | zip2	v16.2d, v20.2d, v22.2d | 
|  | zip1	v17.2d, v21.2d, v23.2d | 
|  | zip2	v18.2d, v21.2d, v23.2d | 
|  |  | 
|  | add	v0.4s, v0.4s, v24.4s | 
|  | add	v5.4s, v5.4s, v28.4s | 
|  | add	v10.4s, v10.4s, v29.4s | 
|  | add	v15.4s, v15.4s, v30.4s | 
|  |  | 
|  | add	v1.4s, v1.4s, v24.4s | 
|  | add	v6.4s, v6.4s, v28.4s | 
|  | add	v11.4s, v11.4s, v29.4s | 
|  | add	v16.4s, v16.4s, v30.4s | 
|  |  | 
|  | add	v2.4s, v2.4s, v24.4s | 
|  | add	v7.4s, v7.4s, v28.4s | 
|  | add	v12.4s, v12.4s, v29.4s | 
|  | add	v17.4s, v17.4s, v30.4s | 
|  |  | 
|  | add	v3.4s, v3.4s, v24.4s | 
|  | add	v8.4s, v8.4s, v28.4s | 
|  | add	v13.4s, v13.4s, v29.4s | 
|  | add	v18.4s, v18.4s, v30.4s | 
|  |  | 
|  | add	v4.4s, v4.4s, v24.4s | 
|  | add	v9.4s, v9.4s, v28.4s | 
|  | add	v14.4s, v14.4s, v29.4s | 
|  | add	v19.4s, v19.4s, v30.4s | 
|  |  | 
|  | // We can always safely store 192 bytes | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v0.16b | 
|  | eor	v21.16b, v21.16b, v5.16b | 
|  | eor	v22.16b, v22.16b, v10.16b | 
|  | eor	v23.16b, v23.16b, v15.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v1.16b | 
|  | eor	v21.16b, v21.16b, v6.16b | 
|  | eor	v22.16b, v22.16b, v11.16b | 
|  | eor	v23.16b, v23.16b, v16.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v2.16b | 
|  | eor	v21.16b, v21.16b, v7.16b | 
|  | eor	v22.16b, v22.16b, v12.16b | 
|  | eor	v23.16b, v23.16b, v17.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | sub	x2, x2, #192 | 
|  |  | 
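// Blocks 3 and 4 are still unused; stage block 3 in v0/v5/v10/v15 in case fewer than 64 bytes remain.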
|  | mov	v0.16b, v3.16b | 
|  | mov	v5.16b, v8.16b | 
|  | mov	v10.16b, v13.16b | 
|  | mov	v15.16b, v18.16b | 
|  |  | 
|  | cmp	x2, #64 | 
|  | b.lt	Lopen_tail_64_store | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v3.16b | 
|  | eor	v21.16b, v21.16b, v8.16b | 
|  | eor	v22.16b, v22.16b, v13.16b | 
|  | eor	v23.16b, v23.16b, v18.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | sub	x2, x2, #64 | 
|  |  | 
|  | mov	v0.16b, v4.16b | 
|  | mov	v5.16b, v9.16b | 
|  | mov	v10.16b, v14.16b | 
|  | mov	v15.16b, v19.16b | 
|  |  | 
|  | cmp	x2, #64 | 
|  | b.lt	Lopen_tail_64_store | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v4.16b | 
|  | eor	v21.16b, v21.16b, v9.16b | 
|  | eor	v22.16b, v22.16b, v14.16b | 
|  | eor	v23.16b, v23.16b, v19.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | sub	x2, x2, #64 | 
|  | b	Lopen_main_loop | 
|  |  | 
|  | Lopen_tail: | 
|  |  | 
|  | cbz	x2, Lopen_finalize | 
|  |  | 
|  | lsr	x4, x2, #4 // How many whole blocks we have to hash | 
|  |  | 
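// At most 192 bytes remain; generate one, two or three more ChaCha20 blocks as needed.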
|  | cmp	x2, #64 | 
|  | b.le	Lopen_tail_64 | 
|  | cmp	x2, #128 | 
|  | b.le	Lopen_tail_128 | 
|  |  | 
|  | Lopen_tail_192: | 
|  | // We need three more blocks | 
|  | mov	v0.16b, v24.16b | 
|  | mov	v1.16b, v24.16b | 
|  | mov	v2.16b, v24.16b | 
|  | mov	v5.16b, v28.16b | 
|  | mov	v6.16b, v28.16b | 
|  | mov	v7.16b, v28.16b | 
|  | mov	v10.16b, v29.16b | 
|  | mov	v11.16b, v29.16b | 
|  | mov	v12.16b, v29.16b | 
|  | mov	v15.16b, v30.16b | 
|  | mov	v16.16b, v30.16b | 
|  | mov	v17.16b, v30.16b | 
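// Counter offsets relative to v25.s[0]: v16 gets +0, v17 gets +1, v15 gets +2
// (the blocks are consumed in that order below).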
|  | eor	v23.16b, v23.16b, v23.16b | 
|  | eor	v21.16b, v21.16b, v21.16b | 
|  | ins	v23.s[0], v25.s[0] | 
|  | ins	v21.d[0], x15 | 
|  |  | 
|  | add	v22.4s, v23.4s, v21.4s | 
|  | add	v21.4s, v22.4s, v21.4s | 
|  |  | 
|  | add	v15.4s, v15.4s, v21.4s | 
|  | add	v16.4s, v16.4s, v23.4s | 
|  | add	v17.4s, v17.4s, v22.4s | 
|  |  | 
|  | mov	x7, #10 | 
subs	x6, x7, x4 // x6 = 10 - (blocks to hash); negative if more than 160 bytes still need hashing
csel	x7, x7, x4, le // if x6 <= 0 keep x7 at 10 so every round hashes, otherwise hash only on the first x4 rounds
|  | sub	x4, x4, x7 | 
|  |  | 
|  | cbz	x7, Lopen_tail_192_rounds_no_hash | 
|  |  | 
|  | Lopen_tail_192_rounds: | 
|  | ldp	x11, x12, [x3], 16 | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | Lopen_tail_192_rounds_no_hash: | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | eor	v6.16b, v6.16b, v11.16b | 
|  | eor	v7.16b, v7.16b, v12.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v6.4s, #20 | 
|  | sli	v5.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v5.4s | 
|  | add	v2.4s, v2.4s, v6.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v5.4s, #25 | 
|  | sli	v6.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v5.16b, v5.16b, v5.16b, #4 | 
|  | ext	v6.16b, v6.16b, v6.16b, #4 | 
|  | ext	v7.16b, v7.16b, v7.16b, #4 | 
|  |  | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v11.16b, v11.16b, v11.16b, #8 | 
|  | ext	v12.16b, v12.16b, v12.16b, #8 | 
|  |  | 
|  | ext	v15.16b, v15.16b, v15.16b, #12 | 
|  | ext	v16.16b, v16.16b, v16.16b, #12 | 
|  | ext	v17.16b, v17.16b, v17.16b, #12 | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | eor	v6.16b, v6.16b, v11.16b | 
|  | eor	v7.16b, v7.16b, v12.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v6.4s, #20 | 
|  | sli	v5.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v5.4s | 
|  | add	v2.4s, v2.4s, v6.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v5.4s, #25 | 
|  | sli	v6.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v5.16b, v5.16b, v5.16b, #12 | 
|  | ext	v6.16b, v6.16b, v6.16b, #12 | 
|  | ext	v7.16b, v7.16b, v7.16b, #12 | 
|  |  | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v11.16b, v11.16b, v11.16b, #8 | 
|  | ext	v12.16b, v12.16b, v12.16b, #8 | 
|  |  | 
|  | ext	v15.16b, v15.16b, v15.16b, #4 | 
|  | ext	v16.16b, v16.16b, v16.16b, #4 | 
|  | ext	v17.16b, v17.16b, v17.16b, #4 | 
|  | subs	x7, x7, #1 | 
|  | b.gt	Lopen_tail_192_rounds | 
|  | subs	x6, x6, #1 | 
|  | b.ge	Lopen_tail_192_rounds_no_hash | 
|  |  | 
// We hashed at most 160 bytes so far; up to 32 bytes may still be left to hash
|  | Lopen_tail_192_hash: | 
|  | cbz	x4, Lopen_tail_192_hash_done | 
|  | ldp	x11, x12, [x3], 16 | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | sub	x4, x4, #1 | 
|  | b	Lopen_tail_192_hash | 
|  |  | 
|  | Lopen_tail_192_hash_done: | 
|  |  | 
|  | add	v0.4s, v0.4s, v24.4s | 
|  | add	v1.4s, v1.4s, v24.4s | 
|  | add	v2.4s, v2.4s, v24.4s | 
|  | add	v5.4s, v5.4s, v28.4s | 
|  | add	v6.4s, v6.4s, v28.4s | 
|  | add	v7.4s, v7.4s, v28.4s | 
|  | add	v10.4s, v10.4s, v29.4s | 
|  | add	v11.4s, v11.4s, v29.4s | 
|  | add	v12.4s, v12.4s, v29.4s | 
|  | add	v15.4s, v15.4s, v30.4s | 
|  | add	v16.4s, v16.4s, v30.4s | 
|  | add	v17.4s, v17.4s, v30.4s | 
|  |  | 
|  | add	v15.4s, v15.4s, v21.4s | 
|  | add	v16.4s, v16.4s, v23.4s | 
|  | add	v17.4s, v17.4s, v22.4s | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  |  | 
|  | eor	v20.16b, v20.16b, v1.16b | 
|  | eor	v21.16b, v21.16b, v6.16b | 
|  | eor	v22.16b, v22.16b, v11.16b | 
|  | eor	v23.16b, v23.16b, v16.16b | 
|  |  | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  |  | 
|  | eor	v20.16b, v20.16b, v2.16b | 
|  | eor	v21.16b, v21.16b, v7.16b | 
|  | eor	v22.16b, v22.16b, v12.16b | 
|  | eor	v23.16b, v23.16b, v17.16b | 
|  |  | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | sub	x2, x2, #128 | 
|  | b	Lopen_tail_64_store | 
|  |  | 
|  | Lopen_tail_128: | 
|  | // We need two more blocks | 
|  | mov	v0.16b, v24.16b | 
|  | mov	v1.16b, v24.16b | 
|  | mov	v5.16b, v28.16b | 
|  | mov	v6.16b, v28.16b | 
|  | mov	v10.16b, v29.16b | 
|  | mov	v11.16b, v29.16b | 
|  | mov	v15.16b, v30.16b | 
|  | mov	v16.16b, v30.16b | 
|  | eor	v23.16b, v23.16b, v23.16b | 
|  | eor	v22.16b, v22.16b, v22.16b | 
|  | ins	v23.s[0], v25.s[0] | 
|  | ins	v22.d[0], x15 | 
|  | add	v22.4s, v22.4s, v23.4s | 
|  |  | 
|  | add	v15.4s, v15.4s, v22.4s | 
|  | add	v16.4s, v16.4s, v23.4s | 
|  |  | 
|  | mov	x6, #10 | 
|  | sub	x6, x6, x4 | 
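// Run 10 double rounds in total; once the first 10 - x4 plain rounds are done,
// one Poly1305 block is hashed per remaining round.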
|  |  | 
|  | Lopen_tail_128_rounds: | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | rev32	v15.8h, v15.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  | ext	v5.16b, v5.16b, v5.16b, #4 | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v15.16b, v15.16b, v15.16b, #12 | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | rev32	v16.8h, v16.8h | 
|  |  | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | eor	v6.16b, v6.16b, v11.16b | 
|  | ushr	v20.4s, v6.4s, #20 | 
|  | sli	v20.4s, v6.4s, #12 | 
|  | add	v1.4s, v1.4s, v20.4s | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  |  | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | eor	v20.16b, v20.16b, v11.16b | 
|  | ushr	v6.4s, v20.4s, #25 | 
|  | sli	v6.4s, v20.4s, #7 | 
|  | ext	v6.16b, v6.16b, v6.16b, #4 | 
|  | ext	v11.16b, v11.16b, v11.16b, #8 | 
|  | ext	v16.16b, v16.16b, v16.16b, #12 | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | rev32	v15.8h, v15.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  | ext	v5.16b, v5.16b, v5.16b, #12 | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v15.16b, v15.16b, v15.16b, #4 | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | rev32	v16.8h, v16.8h | 
|  |  | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | eor	v6.16b, v6.16b, v11.16b | 
|  | ushr	v20.4s, v6.4s, #20 | 
|  | sli	v20.4s, v6.4s, #12 | 
|  | add	v1.4s, v1.4s, v20.4s | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  |  | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | eor	v20.16b, v20.16b, v11.16b | 
|  | ushr	v6.4s, v20.4s, #25 | 
|  | sli	v6.4s, v20.4s, #7 | 
|  | ext	v6.16b, v6.16b, v6.16b, #12 | 
|  | ext	v11.16b, v11.16b, v11.16b, #8 | 
|  | ext	v16.16b, v16.16b, v16.16b, #4 | 
|  | subs	x6, x6, #1 | 
|  | b.gt	Lopen_tail_128_rounds | 
|  | cbz	x4, Lopen_tail_128_rounds_done | 
|  | subs	x4, x4, #1 | 
|  | ldp	x11, x12, [x3], 16 | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | b	Lopen_tail_128_rounds | 
|  |  | 
|  | Lopen_tail_128_rounds_done: | 
|  | add	v0.4s, v0.4s, v24.4s | 
|  | add	v1.4s, v1.4s, v24.4s | 
|  | add	v5.4s, v5.4s, v28.4s | 
|  | add	v6.4s, v6.4s, v28.4s | 
|  | add	v10.4s, v10.4s, v29.4s | 
|  | add	v11.4s, v11.4s, v29.4s | 
|  | add	v15.4s, v15.4s, v30.4s | 
|  | add	v16.4s, v16.4s, v30.4s | 
|  | add	v15.4s, v15.4s, v22.4s | 
|  | add	v16.4s, v16.4s, v23.4s | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  |  | 
|  | eor	v20.16b, v20.16b, v1.16b | 
|  | eor	v21.16b, v21.16b, v6.16b | 
|  | eor	v22.16b, v22.16b, v11.16b | 
|  | eor	v23.16b, v23.16b, v16.16b | 
|  |  | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  | sub	x2, x2, #64 | 
|  |  | 
|  | b	Lopen_tail_64_store | 
|  |  | 
|  | Lopen_tail_64: | 
|  | // We just need a single block | 
|  | mov	v0.16b, v24.16b | 
|  | mov	v5.16b, v28.16b | 
|  | mov	v10.16b, v29.16b | 
|  | mov	v15.16b, v30.16b | 
|  | eor	v23.16b, v23.16b, v23.16b | 
|  | ins	v23.s[0], v25.s[0] | 
|  | add	v15.4s, v15.4s, v23.4s | 
|  |  | 
|  | mov	x6, #10 | 
|  | sub	x6, x6, x4 | 
|  |  | 
|  | Lopen_tail_64_rounds: | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | rev32	v15.8h, v15.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  | ext	v5.16b, v5.16b, v5.16b, #4 | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v15.16b, v15.16b, v15.16b, #12 | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | rev32	v15.8h, v15.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  | ext	v5.16b, v5.16b, v5.16b, #12 | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v15.16b, v15.16b, v15.16b, #4 | 
|  | subs	x6, x6, #1 | 
|  | b.gt	Lopen_tail_64_rounds | 
|  | cbz	x4, Lopen_tail_64_rounds_done | 
|  | subs	x4, x4, #1 | 
|  | ldp	x11, x12, [x3], 16 | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | b	Lopen_tail_64_rounds | 
|  |  | 
|  | Lopen_tail_64_rounds_done: | 
|  | add	v0.4s, v0.4s, v24.4s | 
|  | add	v5.4s, v5.4s, v28.4s | 
|  | add	v10.4s, v10.4s, v29.4s | 
|  | add	v15.4s, v15.4s, v30.4s | 
|  | add	v15.4s, v15.4s, v23.4s | 
|  |  | 
|  | Lopen_tail_64_store: | 
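// XOR and store the remaining data 16 bytes at a time, rotating the staged keystream rows down.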
|  | cmp	x2, #16 | 
|  | b.lt	Lopen_tail_16 | 
|  |  | 
|  | ld1	{v20.16b}, [x1], #16 | 
|  | eor	v20.16b, v20.16b, v0.16b | 
|  | st1	{v20.16b}, [x0], #16 | 
|  | mov	v0.16b, v5.16b | 
|  | mov	v5.16b, v10.16b | 
|  | mov	v10.16b, v15.16b | 
|  | sub	x2, x2, #16 | 
|  | b	Lopen_tail_64_store | 
|  |  | 
|  | Lopen_tail_16: | 
|  | // Here we handle the last [0,16) bytes that require a padded block | 
|  | cbz	x2, Lopen_finalize | 
|  |  | 
|  | eor	v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext | 
|  | eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask | 
|  | not	v22.16b, v20.16b | 
|  |  | 
|  | add	x7, x1, x2 | 
|  | mov	x6, x2 | 
|  |  | 
|  | Lopen_tail_16_compose: | 
|  | ext	v20.16b, v20.16b, v20.16b, #15 | 
|  | ldrb	w11, [x7, #-1]! | 
|  | mov	v20.b[0], w11 | 
|  | ext	v21.16b, v22.16b, v21.16b, #15 | 
|  | subs	x2, x2, #1 | 
|  | b.gt	Lopen_tail_16_compose | 
|  |  | 
|  | and	v20.16b, v20.16b, v21.16b | 
|  | // Hash in the final padded block | 
|  | mov	x11, v20.d[0] | 
|  | mov	x12, v20.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | eor	v20.16b, v20.16b, v0.16b | 
|  |  | 
|  | Lopen_tail_16_store: | 
|  | umov	w11, v20.b[0] | 
|  | strb	w11, [x0], #1 | 
|  | ext	v20.16b, v20.16b, v20.16b, #1 | 
|  | subs	x6, x6, #1 | 
|  | b.gt	Lopen_tail_16_store | 
|  |  | 
|  | Lopen_finalize: | 
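// Hash the final 16-byte length block held in v31, then do the last reduction.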
|  | mov	x11, v31.d[0] | 
|  | mov	x12, v31.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | // Final reduction step | 
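// Conditionally subtract p = 2^130 - 5: keep the subtracted value only if the accumulator was >= p.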
|  | sub	x12, xzr, x15 | 
|  | orr	x13, xzr, #3 | 
|  | subs	x11, x8, #-5 | 
|  | sbcs	x12, x9, x12 | 
|  | sbcs	x13, x10, x13 | 
|  | csel	x8, x11, x8, cs | 
|  | csel	x9, x12, x9, cs | 
|  | csel	x10, x13, x10, cs | 
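// Add the second half of the key (s, kept in v27) and write the 16-byte tag to x5.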
|  | mov	x11, v27.d[0] | 
|  | mov	x12, v27.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  |  | 
|  | stp	x8, x9, [x5] | 
|  |  | 
|  | ldp	d8, d9, [sp, #16] | 
|  | ldp	d10, d11, [sp, #32] | 
|  | ldp	d12, d13, [sp, #48] | 
|  | ldp	d14, d15, [sp, #64] | 
|  | .cfi_restore	b15 | 
|  | .cfi_restore	b14 | 
|  | .cfi_restore	b13 | 
|  | .cfi_restore	b12 | 
|  | .cfi_restore	b11 | 
|  | .cfi_restore	b10 | 
|  | .cfi_restore	b9 | 
|  | .cfi_restore	b8 | 
|  | ldp	x29, x30, [sp], 80 | 
|  | .cfi_restore	w29 | 
|  | .cfi_restore	w30 | 
|  | .cfi_def_cfa_offset	0 | 
|  | AARCH64_VALIDATE_LINK_REGISTER | 
|  | ret | 
|  |  | 
|  | Lopen_128: | 
|  | // On some architectures preparing 5 blocks for small buffers is wasteful | 
|  | eor	v25.16b, v25.16b, v25.16b | 
|  | mov	x11, #1 | 
|  | mov	v25.s[0], w11 | 
|  | mov	v0.16b, v24.16b | 
|  | mov	v1.16b, v24.16b | 
|  | mov	v2.16b, v24.16b | 
|  | mov	v5.16b, v28.16b | 
|  | mov	v6.16b, v28.16b | 
|  | mov	v7.16b, v28.16b | 
|  | mov	v10.16b, v29.16b | 
|  | mov	v11.16b, v29.16b | 
|  | mov	v12.16b, v29.16b | 
|  | mov	v17.16b, v30.16b | 
|  | add	v15.4s, v17.4s, v25.4s | 
|  | add	v16.4s, v15.4s, v25.4s | 
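// Block 2 (counter 0) supplies the Poly1305 key; blocks 0 and 1 (counters 1 and 2)
// provide the keystream for the data.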
|  |  | 
|  | mov	x6, #10 | 
|  |  | 
|  | Lopen_128_rounds: | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | eor	v6.16b, v6.16b, v11.16b | 
|  | eor	v7.16b, v7.16b, v12.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v6.4s, #20 | 
|  | sli	v5.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v5.4s | 
|  | add	v2.4s, v2.4s, v6.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v5.4s, #25 | 
|  | sli	v6.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v5.16b, v5.16b, v5.16b, #4 | 
|  | ext	v6.16b, v6.16b, v6.16b, #4 | 
|  | ext	v7.16b, v7.16b, v7.16b, #4 | 
|  |  | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v11.16b, v11.16b, v11.16b, #8 | 
|  | ext	v12.16b, v12.16b, v12.16b, #8 | 
|  |  | 
|  | ext	v15.16b, v15.16b, v15.16b, #12 | 
|  | ext	v16.16b, v16.16b, v16.16b, #12 | 
|  | ext	v17.16b, v17.16b, v17.16b, #12 | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | eor	v6.16b, v6.16b, v11.16b | 
|  | eor	v7.16b, v7.16b, v12.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v6.4s, #20 | 
|  | sli	v5.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v5.4s | 
|  | add	v2.4s, v2.4s, v6.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v5.4s, #25 | 
|  | sli	v6.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v5.16b, v5.16b, v5.16b, #12 | 
|  | ext	v6.16b, v6.16b, v6.16b, #12 | 
|  | ext	v7.16b, v7.16b, v7.16b, #12 | 
|  |  | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v11.16b, v11.16b, v11.16b, #8 | 
|  | ext	v12.16b, v12.16b, v12.16b, #8 | 
|  |  | 
|  | ext	v15.16b, v15.16b, v15.16b, #4 | 
|  | ext	v16.16b, v16.16b, v16.16b, #4 | 
|  | ext	v17.16b, v17.16b, v17.16b, #4 | 
|  | subs	x6, x6, #1 | 
|  | b.hi	Lopen_128_rounds | 
|  |  | 
|  | add	v0.4s, v0.4s, v24.4s | 
|  | add	v1.4s, v1.4s, v24.4s | 
|  | add	v2.4s, v2.4s, v24.4s | 
|  |  | 
|  | add	v5.4s, v5.4s, v28.4s | 
|  | add	v6.4s, v6.4s, v28.4s | 
|  | add	v7.4s, v7.4s, v28.4s | 
|  |  | 
|  | add	v10.4s, v10.4s, v29.4s | 
|  | add	v11.4s, v11.4s, v29.4s | 
|  |  | 
|  | add	v30.4s, v30.4s, v25.4s | 
|  | add	v15.4s, v15.4s, v30.4s | 
|  | add	v30.4s, v30.4s, v25.4s | 
|  | add	v16.4s, v16.4s, v30.4s | 
|  |  | 
|  | and	v2.16b, v2.16b, v27.16b | 
|  | mov	x16, v2.d[0] // Move the R key to GPRs | 
|  | mov	x17, v2.d[1] | 
|  | mov	v27.16b, v7.16b // Store the S key | 
|  |  | 
|  | bl	Lpoly_hash_ad_internal | 
|  |  | 
|  | Lopen_128_store: | 
|  | cmp	x2, #64 | 
|  | b.lt	Lopen_128_store_64 | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  |  | 
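// Hash the 64 ciphertext bytes just loaded (four Poly1305 blocks), then XOR them with the keystream below.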
|  | mov	x11, v20.d[0] | 
|  | mov	x12, v20.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | mov	x11, v21.d[0] | 
|  | mov	x12, v21.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | mov	x11, v22.d[0] | 
|  | mov	x12, v22.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | mov	x11, v23.d[0] | 
|  | mov	x12, v23.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  |  | 
|  | eor	v20.16b, v20.16b, v0.16b | 
|  | eor	v21.16b, v21.16b, v5.16b | 
|  | eor	v22.16b, v22.16b, v10.16b | 
|  | eor	v23.16b, v23.16b, v15.16b | 
|  |  | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | sub	x2, x2, #64 | 
|  |  | 
|  | mov	v0.16b, v1.16b | 
|  | mov	v5.16b, v6.16b | 
|  | mov	v10.16b, v11.16b | 
|  | mov	v15.16b, v16.16b | 
|  |  | 
|  | Lopen_128_store_64: | 
|  |  | 
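// Hash any remaining whole 16-byte ciphertext blocks, then decrypt what is left via Lopen_tail_64_store.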
|  | lsr	x4, x2, #4 | 
|  | mov	x3, x1 | 
|  |  | 
|  | Lopen_128_hash_64: | 
|  | cbz	x4, Lopen_tail_64_store | 
|  | ldp	x11, x12, [x3], 16 | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | sub	x4, x4, #1 | 
|  | b	Lopen_128_hash_64 | 
|  | .cfi_endproc | 
|  |  | 
|  | #endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) |