|  | // This file is generated from a similarly-named Perl script in the BoringSSL | 
|  | // source tree. Do not edit by hand. | 
|  |  | 
|  | #include <openssl/asm_base.h> | 
|  |  | 
|  | #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) | 
|  | #include <openssl/arm_arch.h> | 
|  | .section	.rodata | 
|  |  | 
|  | .align	7 | 
|  | Lchacha20_consts: | 
|  | .byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' | 
|  | Linc: | 
|  | .long	1,2,3,4 | 
|  | Lrol8: | 
|  | .byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 | 
|  | Lclamp: | 
|  | .quad	0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC | 
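// Linc holds per-block counter increments, Lrol8 is a tbl permutation that
// rotates each 32-bit lane left by 8 bits, and Lclamp is the Poly1305 r-key
// clamp mask.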
|  |  | 
|  | .text | 
|  |  | 
|  | .def Lpoly_hash_ad_internal | 
|  | .type 32 | 
|  | .endef | 
|  | .align	6 | 
|  | Lpoly_hash_ad_internal: | 
|  | .cfi_startproc | 
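// Hashes the AAD: x3 points to the AAD and x4 holds its length in bytes.
// The Poly1305 accumulator lives in x8:x9:x10, the clamped r key in x16:x17,
// and x15 holds the constant 1 used as the 2^128 pad bit of each full block.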
|  | cbnz	x4, Lpoly_hash_intro | 
|  | ret | 
|  |  | 
|  | Lpoly_hash_intro: | 
|  | cmp	x4, #16 | 
|  | b.lt	Lpoly_hash_ad_tail | 
|  | ldp	x11, x12, [x3], 16 | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
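// Multiply the accumulator by r and reduce modulo 2^130 - 5: the bits above
// 2^130 are folded back in multiplied by 5 (added once as-is and once
// multiplied by four).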
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | sub	x4, x4, #16 | 
|  | b	Lpoly_hash_ad_internal | 
|  |  | 
|  | Lpoly_hash_ad_tail: | 
|  | cbz	x4, Lpoly_hash_ad_ret | 
|  |  | 
|  | eor	v20.16b, v20.16b, v20.16b // Use T0 to load the AAD | 
|  | sub	x4, x4, #1 | 
|  |  | 
|  | Lpoly_hash_tail_16_compose: | 
|  | ext	v20.16b, v20.16b, v20.16b, #15 | 
|  | ldrb	w11, [x3, x4] | 
|  | mov	v20.b[0], w11 | 
|  | subs	x4, x4, #1 | 
|  | b.ge	Lpoly_hash_tail_16_compose | 
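// v20 now holds the final partial AAD block, zero-padded up to 16 bytes.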
|  | mov	x11, v20.d[0] | 
|  | mov	x12, v20.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  |  | 
|  | Lpoly_hash_ad_ret: | 
|  | ret | 
|  | .cfi_endproc | 
|  |  | 
|  |  | 
|  | ///////////////////////////////// | 
|  | // | 
|  | // void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); | 
|  | // | 
|  | .globl	chacha20_poly1305_seal | 
|  |  | 
|  | .def chacha20_poly1305_seal | 
|  | .type 32 | 
|  | .endef | 
|  | .align	6 | 
|  | chacha20_poly1305_seal: | 
|  | AARCH64_SIGN_LINK_REGISTER | 
|  | .cfi_startproc | 
|  | stp	x29, x30, [sp, #-80]! | 
|  | .cfi_def_cfa_offset	80 | 
|  | .cfi_offset	w30, -72 | 
|  | .cfi_offset	w29, -80 | 
|  | mov	x29, sp | 
|  | // We probably could do .cfi_def_cfa w29, 80 at this point, but since | 
|  | // we don't actually use the frame pointer like that, it's probably not | 
|  | // worth bothering. | 
|  | stp	d8, d9, [sp, #16] | 
|  | stp	d10, d11, [sp, #32] | 
|  | stp	d12, d13, [sp, #48] | 
|  | stp	d14, d15, [sp, #64] | 
|  | .cfi_offset	b15, -8 | 
|  | .cfi_offset	b14, -16 | 
|  | .cfi_offset	b13, -24 | 
|  | .cfi_offset	b12, -32 | 
|  | .cfi_offset	b11, -40 | 
|  | .cfi_offset	b10, -48 | 
|  | .cfi_offset	b9, -56 | 
|  | .cfi_offset	b8, -64 | 
|  |  | 
|  | adrp	x11, Lchacha20_consts | 
|  | add	x11, x11, :lo12:Lchacha20_consts | 
|  |  | 
|  | ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values | 
|  | ld1	{v28.16b - v30.16b}, [x5] | 
|  |  | 
|  | mov	x15, #1 // Prepare the Poly1305 state | 
|  | mov	x8, #0 | 
|  | mov	x9, #0 | 
|  | mov	x10, #0 | 
|  |  | 
|  | ldr	x12, [x5, #56]   // The total cipher text length includes extra_in_len | 
|  | add	x12, x12, x2 | 
|  | mov	v31.d[0], x4  // Store the input and aad lengths | 
|  | mov	v31.d[1], x12 | 
|  |  | 
|  | cmp	x2, #128 | 
|  | b.le	Lseal_128 // Optimization for smaller buffers | 
|  |  | 
|  | // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, | 
|  | // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, | 
|  | // the fifth block (A4-D4) horizontally. | 
|  | ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11] | 
|  | mov	v4.16b, v24.16b | 
|  |  | 
|  | ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 | 
|  | mov	v9.16b, v28.16b | 
|  |  | 
|  | ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 | 
|  | mov	v14.16b, v29.16b | 
|  |  | 
|  | ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5] | 
|  | add	v15.4s, v15.4s, v25.4s | 
|  | mov	v19.16b, v30.16b | 
|  |  | 
|  | sub	x5, x5, #32 | 
|  |  | 
|  | mov	x6, #10 | 
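// Each iteration below runs one ChaCha20 double round (a column round then a
// diagonal round) on all five blocks; rev32, tbl with Lrol8, and the ushr+sli
// pairs implement the 16-, 8-, 12- and 7-bit rotations.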
|  |  | 
|  | .align	5 | 
|  | Lseal_init_rounds: | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | add	v3.4s, v3.4s, v8.4s | 
|  | add	v4.4s, v4.4s, v9.4s | 
|  |  | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | eor	v18.16b, v18.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  | rev32	v18.8h, v18.8h | 
|  | rev32	v19.8h, v19.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | add	v13.4s, v13.4s, v18.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | eor	v6.16b, v6.16b, v11.16b | 
|  | eor	v7.16b, v7.16b, v12.16b | 
|  | eor	v8.16b, v8.16b, v13.16b | 
|  | eor	v9.16b, v9.16b, v14.16b | 
|  |  | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v6.4s, #20 | 
|  | sli	v5.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  | ushr	v7.4s, v8.4s, #20 | 
|  | sli	v7.4s, v8.4s, #12 | 
|  | ushr	v8.4s, v9.4s, #20 | 
|  | sli	v8.4s, v9.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v5.4s | 
|  | add	v2.4s, v2.4s, v6.4s | 
|  | add	v3.4s, v3.4s, v7.4s | 
|  | add	v4.4s, v4.4s, v8.4s | 
|  |  | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | eor	v18.16b, v18.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  | tbl	v18.16b, {v18.16b}, v26.16b | 
|  | tbl	v19.16b, {v19.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | add	v13.4s, v13.4s, v18.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | eor	v7.16b, v7.16b, v13.16b | 
|  | eor	v8.16b, v8.16b, v14.16b | 
|  |  | 
|  | ushr	v9.4s, v8.4s, #25 | 
|  | sli	v9.4s, v8.4s, #7 | 
|  | ushr	v8.4s, v7.4s, #25 | 
|  | sli	v8.4s, v7.4s, #7 | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v5.4s, #25 | 
|  | sli	v6.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v9.16b, v9.16b, v9.16b, #4 | 
|  | ext	v14.16b, v14.16b, v14.16b, #8 | 
|  | ext	v19.16b, v19.16b, v19.16b, #12 | 
|  | add	v0.4s, v0.4s, v6.4s | 
|  | add	v1.4s, v1.4s, v7.4s | 
|  | add	v2.4s, v2.4s, v8.4s | 
|  | add	v3.4s, v3.4s, v5.4s | 
|  | add	v4.4s, v4.4s, v9.4s | 
|  |  | 
|  | eor	v18.16b, v18.16b, v0.16b | 
|  | eor	v15.16b, v15.16b, v1.16b | 
|  | eor	v16.16b, v16.16b, v2.16b | 
|  | eor	v17.16b, v17.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | rev32	v18.8h, v18.8h | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  | rev32	v19.8h, v19.8h | 
|  |  | 
|  | add	v12.4s, v12.4s, v18.4s | 
|  | add	v13.4s, v13.4s, v15.4s | 
|  | add	v10.4s, v10.4s, v16.4s | 
|  | add	v11.4s, v11.4s, v17.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | eor	v7.16b, v7.16b, v13.16b | 
|  | eor	v8.16b, v8.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v9.16b, v9.16b, v14.16b | 
|  |  | 
|  | ushr	v20.4s, v6.4s, #20 | 
|  | sli	v20.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  | ushr	v7.4s, v8.4s, #20 | 
|  | sli	v7.4s, v8.4s, #12 | 
|  | ushr	v8.4s, v5.4s, #20 | 
|  | sli	v8.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v9.4s, #20 | 
|  | sli	v5.4s, v9.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | add	v3.4s, v3.4s, v8.4s | 
|  | add	v4.4s, v4.4s, v5.4s | 
|  |  | 
|  | eor	v18.16b, v18.16b, v0.16b | 
|  | eor	v15.16b, v15.16b, v1.16b | 
|  | eor	v16.16b, v16.16b, v2.16b | 
|  | eor	v17.16b, v17.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | tbl	v18.16b, {v18.16b}, v26.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  | tbl	v19.16b, {v19.16b}, v26.16b | 
|  |  | 
|  | add	v12.4s, v12.4s, v18.4s | 
|  | add	v13.4s, v13.4s, v15.4s | 
|  | add	v10.4s, v10.4s, v16.4s | 
|  | add	v11.4s, v11.4s, v17.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v20.16b, v20.16b, v12.16b | 
|  | eor	v6.16b, v6.16b, v13.16b | 
|  | eor	v7.16b, v7.16b, v10.16b | 
|  | eor	v8.16b, v8.16b, v11.16b | 
|  | eor	v5.16b, v5.16b, v14.16b | 
|  |  | 
|  | ushr	v9.4s, v5.4s, #25 | 
|  | sli	v9.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v8.4s, #25 | 
|  | sli	v5.4s, v8.4s, #7 | 
|  | ushr	v8.4s, v7.4s, #25 | 
|  | sli	v8.4s, v7.4s, #7 | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v20.4s, #25 | 
|  | sli	v6.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v9.16b, v9.16b, v9.16b, #12 | 
|  | ext	v14.16b, v14.16b, v14.16b, #8 | 
|  | ext	v19.16b, v19.16b, v19.16b, #4 | 
|  | subs	x6, x6, #1 | 
|  | b.hi	Lseal_init_rounds | 
|  |  | 
|  | add	v15.4s, v15.4s, v25.4s | 
|  | mov	x11, #4 | 
|  | dup	v20.4s, w11 | 
|  | add	v25.4s, v25.4s, v20.4s | 
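// Transpose the four vertically-computed blocks back into sequential block
// order: a 32-bit interleave followed by a 64-bit interleave per row.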
|  |  | 
|  | zip1	v20.4s, v0.4s, v1.4s | 
|  | zip2	v21.4s, v0.4s, v1.4s | 
|  | zip1	v22.4s, v2.4s, v3.4s | 
|  | zip2	v23.4s, v2.4s, v3.4s | 
|  |  | 
|  | zip1	v0.2d, v20.2d, v22.2d | 
|  | zip2	v1.2d, v20.2d, v22.2d | 
|  | zip1	v2.2d, v21.2d, v23.2d | 
|  | zip2	v3.2d, v21.2d, v23.2d | 
|  |  | 
|  | zip1	v20.4s, v5.4s, v6.4s | 
|  | zip2	v21.4s, v5.4s, v6.4s | 
|  | zip1	v22.4s, v7.4s, v8.4s | 
|  | zip2	v23.4s, v7.4s, v8.4s | 
|  |  | 
|  | zip1	v5.2d, v20.2d, v22.2d | 
|  | zip2	v6.2d, v20.2d, v22.2d | 
|  | zip1	v7.2d, v21.2d, v23.2d | 
|  | zip2	v8.2d, v21.2d, v23.2d | 
|  |  | 
|  | zip1	v20.4s, v10.4s, v11.4s | 
|  | zip2	v21.4s, v10.4s, v11.4s | 
|  | zip1	v22.4s, v12.4s, v13.4s | 
|  | zip2	v23.4s, v12.4s, v13.4s | 
|  |  | 
|  | zip1	v10.2d, v20.2d, v22.2d | 
|  | zip2	v11.2d, v20.2d, v22.2d | 
|  | zip1	v12.2d, v21.2d, v23.2d | 
|  | zip2	v13.2d, v21.2d, v23.2d | 
|  |  | 
|  | zip1	v20.4s, v15.4s, v16.4s | 
|  | zip2	v21.4s, v15.4s, v16.4s | 
|  | zip1	v22.4s, v17.4s, v18.4s | 
|  | zip2	v23.4s, v17.4s, v18.4s | 
|  |  | 
|  | zip1	v15.2d, v20.2d, v22.2d | 
|  | zip2	v16.2d, v20.2d, v22.2d | 
|  | zip1	v17.2d, v21.2d, v23.2d | 
|  | zip2	v18.2d, v21.2d, v23.2d | 
|  |  | 
|  | add	v4.4s, v4.4s, v24.4s | 
|  | add	v9.4s, v9.4s, v28.4s | 
|  | and	v4.16b, v4.16b, v27.16b | 
|  |  | 
|  | add	v0.4s, v0.4s, v24.4s | 
|  | add	v5.4s, v5.4s, v28.4s | 
|  | add	v10.4s, v10.4s, v29.4s | 
|  | add	v15.4s, v15.4s, v30.4s | 
|  |  | 
|  | add	v1.4s, v1.4s, v24.4s | 
|  | add	v6.4s, v6.4s, v28.4s | 
|  | add	v11.4s, v11.4s, v29.4s | 
|  | add	v16.4s, v16.4s, v30.4s | 
|  |  | 
|  | add	v2.4s, v2.4s, v24.4s | 
|  | add	v7.4s, v7.4s, v28.4s | 
|  | add	v12.4s, v12.4s, v29.4s | 
|  | add	v17.4s, v17.4s, v30.4s | 
|  |  | 
|  | add	v3.4s, v3.4s, v24.4s | 
|  | add	v8.4s, v8.4s, v28.4s | 
|  | add	v13.4s, v13.4s, v29.4s | 
|  | add	v18.4s, v18.4s, v30.4s | 
|  |  | 
|  | mov	x16, v4.d[0] // Move the R key to GPRs | 
|  | mov	x17, v4.d[1] | 
|  | mov	v27.16b, v9.16b // Store the S key | 
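// x16:x17 now hold the clamped Poly1305 r key; v27 (which held the clamp
// mask) is reused to carry the s key until finalization.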
|  |  | 
|  | bl	Lpoly_hash_ad_internal | 
|  |  | 
|  | mov	x3, x0 | 
|  | cmp	x2, #256 | 
|  | b.le	Lseal_tail | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v0.16b | 
|  | eor	v21.16b, v21.16b, v5.16b | 
|  | eor	v22.16b, v22.16b, v10.16b | 
|  | eor	v23.16b, v23.16b, v15.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v1.16b | 
|  | eor	v21.16b, v21.16b, v6.16b | 
|  | eor	v22.16b, v22.16b, v11.16b | 
|  | eor	v23.16b, v23.16b, v16.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v2.16b | 
|  | eor	v21.16b, v21.16b, v7.16b | 
|  | eor	v22.16b, v22.16b, v12.16b | 
|  | eor	v23.16b, v23.16b, v17.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v3.16b | 
|  | eor	v21.16b, v21.16b, v8.16b | 
|  | eor	v22.16b, v22.16b, v13.16b | 
|  | eor	v23.16b, v23.16b, v18.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | sub	x2, x2, #256 | 
|  |  | 
|  | mov	x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds | 
|  | mov	x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 | 
|  |  | 
|  | Lseal_main_loop: | 
|  | adrp	x11, Lchacha20_consts | 
|  | add	x11, x11, :lo12:Lchacha20_consts | 
|  |  | 
|  | ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11] | 
|  | mov	v4.16b, v24.16b | 
|  |  | 
|  | ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 | 
|  | mov	v9.16b, v28.16b | 
|  |  | 
|  | ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 | 
|  | mov	v14.16b, v29.16b | 
|  |  | 
|  | ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5] | 
|  | add	v15.4s, v15.4s, v25.4s | 
|  | mov	v19.16b, v30.16b | 
|  |  | 
|  | eor	v20.16b, v20.16b, v20.16b //zero | 
|  | not	v21.16b, v20.16b // -1 | 
|  | sub	v21.4s, v25.4s, v21.4s // Add +1 | 
|  | ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) | 
|  | add	v19.4s, v19.4s, v20.4s | 
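// The fifth (horizontally-computed) block now uses the counter immediately
// after the four vertically-computed blocks.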
|  |  | 
|  | sub	x5, x5, #32 | 
|  | .align	5 | 
|  | Lseal_main_loop_rounds: | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | add	v3.4s, v3.4s, v8.4s | 
|  | add	v4.4s, v4.4s, v9.4s | 
|  |  | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | eor	v18.16b, v18.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  | rev32	v18.8h, v18.8h | 
|  | rev32	v19.8h, v19.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | add	v13.4s, v13.4s, v18.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | eor	v6.16b, v6.16b, v11.16b | 
|  | eor	v7.16b, v7.16b, v12.16b | 
|  | eor	v8.16b, v8.16b, v13.16b | 
|  | eor	v9.16b, v9.16b, v14.16b | 
|  |  | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v6.4s, #20 | 
|  | sli	v5.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  | ushr	v7.4s, v8.4s, #20 | 
|  | sli	v7.4s, v8.4s, #12 | 
|  | ushr	v8.4s, v9.4s, #20 | 
|  | sli	v8.4s, v9.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v5.4s | 
|  | add	v2.4s, v2.4s, v6.4s | 
|  | add	v3.4s, v3.4s, v7.4s | 
|  | add	v4.4s, v4.4s, v8.4s | 
|  |  | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | eor	v18.16b, v18.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  | tbl	v18.16b, {v18.16b}, v26.16b | 
|  | tbl	v19.16b, {v19.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | add	v13.4s, v13.4s, v18.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | eor	v7.16b, v7.16b, v13.16b | 
|  | eor	v8.16b, v8.16b, v14.16b | 
|  |  | 
|  | ushr	v9.4s, v8.4s, #25 | 
|  | sli	v9.4s, v8.4s, #7 | 
|  | ushr	v8.4s, v7.4s, #25 | 
|  | sli	v8.4s, v7.4s, #7 | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v5.4s, #25 | 
|  | sli	v6.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v9.16b, v9.16b, v9.16b, #4 | 
|  | ext	v14.16b, v14.16b, v14.16b, #8 | 
|  | ext	v19.16b, v19.16b, v19.16b, #12 | 
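// From here on, a 16-byte Poly1305 absorb-and-reduce of previously written
// ciphertext is interleaved with the ChaCha20 rounds so both dependency
// chains can progress in parallel.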
|  | ldp	x11, x12, [x3], 16 | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | add	v0.4s, v0.4s, v6.4s | 
|  | add	v1.4s, v1.4s, v7.4s | 
|  | add	v2.4s, v2.4s, v8.4s | 
|  | add	v3.4s, v3.4s, v5.4s | 
|  | add	v4.4s, v4.4s, v9.4s | 
|  |  | 
|  | eor	v18.16b, v18.16b, v0.16b | 
|  | eor	v15.16b, v15.16b, v1.16b | 
|  | eor	v16.16b, v16.16b, v2.16b | 
|  | eor	v17.16b, v17.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | rev32	v18.8h, v18.8h | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  | rev32	v19.8h, v19.8h | 
|  |  | 
|  | add	v12.4s, v12.4s, v18.4s | 
|  | add	v13.4s, v13.4s, v15.4s | 
|  | add	v10.4s, v10.4s, v16.4s | 
|  | add	v11.4s, v11.4s, v17.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | eor	v7.16b, v7.16b, v13.16b | 
|  | eor	v8.16b, v8.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v9.16b, v9.16b, v14.16b | 
|  |  | 
|  | ushr	v20.4s, v6.4s, #20 | 
|  | sli	v20.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  | ushr	v7.4s, v8.4s, #20 | 
|  | sli	v7.4s, v8.4s, #12 | 
|  | ushr	v8.4s, v5.4s, #20 | 
|  | sli	v8.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v9.4s, #20 | 
|  | sli	v5.4s, v9.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | add	v3.4s, v3.4s, v8.4s | 
|  | add	v4.4s, v4.4s, v5.4s | 
|  |  | 
|  | eor	v18.16b, v18.16b, v0.16b | 
|  | eor	v15.16b, v15.16b, v1.16b | 
|  | eor	v16.16b, v16.16b, v2.16b | 
|  | eor	v17.16b, v17.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | tbl	v18.16b, {v18.16b}, v26.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  | tbl	v19.16b, {v19.16b}, v26.16b | 
|  |  | 
|  | add	v12.4s, v12.4s, v18.4s | 
|  | add	v13.4s, v13.4s, v15.4s | 
|  | add	v10.4s, v10.4s, v16.4s | 
|  | add	v11.4s, v11.4s, v17.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v20.16b, v20.16b, v12.16b | 
|  | eor	v6.16b, v6.16b, v13.16b | 
|  | eor	v7.16b, v7.16b, v10.16b | 
|  | eor	v8.16b, v8.16b, v11.16b | 
|  | eor	v5.16b, v5.16b, v14.16b | 
|  |  | 
|  | ushr	v9.4s, v5.4s, #25 | 
|  | sli	v9.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v8.4s, #25 | 
|  | sli	v5.4s, v8.4s, #7 | 
|  | ushr	v8.4s, v7.4s, #25 | 
|  | sli	v8.4s, v7.4s, #7 | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v20.4s, #25 | 
|  | sli	v6.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v9.16b, v9.16b, v9.16b, #12 | 
|  | ext	v14.16b, v14.16b, v14.16b, #8 | 
|  | ext	v19.16b, v19.16b, v19.16b, #4 | 
|  | subs	x6, x6, #1 | 
|  | b.ge	Lseal_main_loop_rounds | 
|  | ldp	x11, x12, [x3], 16 | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | subs	x7, x7, #1 | 
|  | b.gt	Lseal_main_loop_rounds | 
|  |  | 
|  | eor	v20.16b, v20.16b, v20.16b //zero | 
|  | not	v21.16b, v20.16b // -1 | 
|  | sub	v21.4s, v25.4s, v21.4s // Add +1 | 
|  | ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) | 
|  | add	v19.4s, v19.4s, v20.4s | 
|  |  | 
|  | add	v15.4s, v15.4s, v25.4s | 
|  | mov	x11, #5 | 
|  | dup	v20.4s, w11 | 
|  | add	v25.4s, v25.4s, v20.4s | 
|  |  | 
|  | zip1	v20.4s, v0.4s, v1.4s | 
|  | zip2	v21.4s, v0.4s, v1.4s | 
|  | zip1	v22.4s, v2.4s, v3.4s | 
|  | zip2	v23.4s, v2.4s, v3.4s | 
|  |  | 
|  | zip1	v0.2d, v20.2d, v22.2d | 
|  | zip2	v1.2d, v20.2d, v22.2d | 
|  | zip1	v2.2d, v21.2d, v23.2d | 
|  | zip2	v3.2d, v21.2d, v23.2d | 
|  |  | 
|  | zip1	v20.4s, v5.4s, v6.4s | 
|  | zip2	v21.4s, v5.4s, v6.4s | 
|  | zip1	v22.4s, v7.4s, v8.4s | 
|  | zip2	v23.4s, v7.4s, v8.4s | 
|  |  | 
|  | zip1	v5.2d, v20.2d, v22.2d | 
|  | zip2	v6.2d, v20.2d, v22.2d | 
|  | zip1	v7.2d, v21.2d, v23.2d | 
|  | zip2	v8.2d, v21.2d, v23.2d | 
|  |  | 
|  | zip1	v20.4s, v10.4s, v11.4s | 
|  | zip2	v21.4s, v10.4s, v11.4s | 
|  | zip1	v22.4s, v12.4s, v13.4s | 
|  | zip2	v23.4s, v12.4s, v13.4s | 
|  |  | 
|  | zip1	v10.2d, v20.2d, v22.2d | 
|  | zip2	v11.2d, v20.2d, v22.2d | 
|  | zip1	v12.2d, v21.2d, v23.2d | 
|  | zip2	v13.2d, v21.2d, v23.2d | 
|  |  | 
|  | zip1	v20.4s, v15.4s, v16.4s | 
|  | zip2	v21.4s, v15.4s, v16.4s | 
|  | zip1	v22.4s, v17.4s, v18.4s | 
|  | zip2	v23.4s, v17.4s, v18.4s | 
|  |  | 
|  | zip1	v15.2d, v20.2d, v22.2d | 
|  | zip2	v16.2d, v20.2d, v22.2d | 
|  | zip1	v17.2d, v21.2d, v23.2d | 
|  | zip2	v18.2d, v21.2d, v23.2d | 
|  |  | 
|  | add	v0.4s, v0.4s, v24.4s | 
|  | add	v5.4s, v5.4s, v28.4s | 
|  | add	v10.4s, v10.4s, v29.4s | 
|  | add	v15.4s, v15.4s, v30.4s | 
|  |  | 
|  | add	v1.4s, v1.4s, v24.4s | 
|  | add	v6.4s, v6.4s, v28.4s | 
|  | add	v11.4s, v11.4s, v29.4s | 
|  | add	v16.4s, v16.4s, v30.4s | 
|  |  | 
|  | add	v2.4s, v2.4s, v24.4s | 
|  | add	v7.4s, v7.4s, v28.4s | 
|  | add	v12.4s, v12.4s, v29.4s | 
|  | add	v17.4s, v17.4s, v30.4s | 
|  |  | 
|  | add	v3.4s, v3.4s, v24.4s | 
|  | add	v8.4s, v8.4s, v28.4s | 
|  | add	v13.4s, v13.4s, v29.4s | 
|  | add	v18.4s, v18.4s, v30.4s | 
|  |  | 
|  | add	v4.4s, v4.4s, v24.4s | 
|  | add	v9.4s, v9.4s, v28.4s | 
|  | add	v14.4s, v14.4s, v29.4s | 
|  | add	v19.4s, v19.4s, v30.4s | 
|  |  | 
|  | cmp	x2, #320 | 
|  | b.le	Lseal_tail | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v0.16b | 
|  | eor	v21.16b, v21.16b, v5.16b | 
|  | eor	v22.16b, v22.16b, v10.16b | 
|  | eor	v23.16b, v23.16b, v15.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v1.16b | 
|  | eor	v21.16b, v21.16b, v6.16b | 
|  | eor	v22.16b, v22.16b, v11.16b | 
|  | eor	v23.16b, v23.16b, v16.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v2.16b | 
|  | eor	v21.16b, v21.16b, v7.16b | 
|  | eor	v22.16b, v22.16b, v12.16b | 
|  | eor	v23.16b, v23.16b, v17.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v3.16b | 
|  | eor	v21.16b, v21.16b, v8.16b | 
|  | eor	v22.16b, v22.16b, v13.16b | 
|  | eor	v23.16b, v23.16b, v18.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v4.16b | 
|  | eor	v21.16b, v21.16b, v9.16b | 
|  | eor	v22.16b, v22.16b, v14.16b | 
|  | eor	v23.16b, v23.16b, v19.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | sub	x2, x2, #320 | 
|  |  | 
|  | mov	x6, #0 | 
|  | mov	x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration | 
|  |  | 
|  | b	Lseal_main_loop | 
|  |  | 
|  | Lseal_tail: | 
|  | // This part of the function handles the storage and authentication of the last [0,320) bytes | 
|  | // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. | 
|  | cmp	x2, #64 | 
|  | b.lt	Lseal_tail_64 | 
|  |  | 
|  | // Store and authenticate 64B blocks per iteration | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  |  | 
|  | eor	v20.16b, v20.16b, v0.16b | 
|  | eor	v21.16b, v21.16b, v5.16b | 
|  | eor	v22.16b, v22.16b, v10.16b | 
|  | eor	v23.16b, v23.16b, v15.16b | 
|  | mov	x11, v20.d[0] | 
|  | mov	x12, v20.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | mov	x11, v21.d[0] | 
|  | mov	x12, v21.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | mov	x11, v22.d[0] | 
|  | mov	x12, v22.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | mov	x11, v23.d[0] | 
|  | mov	x12, v23.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  | sub	x2, x2, #64 | 
|  |  | 
|  | // Shift the state left by 64 bytes for the next iteration of the loop | 
|  | mov	v0.16b, v1.16b | 
|  | mov	v5.16b, v6.16b | 
|  | mov	v10.16b, v11.16b | 
|  | mov	v15.16b, v16.16b | 
|  |  | 
|  | mov	v1.16b, v2.16b | 
|  | mov	v6.16b, v7.16b | 
|  | mov	v11.16b, v12.16b | 
|  | mov	v16.16b, v17.16b | 
|  |  | 
|  | mov	v2.16b, v3.16b | 
|  | mov	v7.16b, v8.16b | 
|  | mov	v12.16b, v13.16b | 
|  | mov	v17.16b, v18.16b | 
|  |  | 
|  | mov	v3.16b, v4.16b | 
|  | mov	v8.16b, v9.16b | 
|  | mov	v13.16b, v14.16b | 
|  | mov	v18.16b, v19.16b | 
|  |  | 
|  | b	Lseal_tail | 
|  |  | 
|  | Lseal_tail_64: | 
|  | ldp	x3, x4, [x5, #48] // extra_in_len and extra_in_ptr | 
|  |  | 
|  | // Here we handle the last [0,64) bytes of plaintext | 
|  | cmp	x2, #16 | 
|  | b.lt	Lseal_tail_16 | 
// Each iteration encrypts and authenticates a 16B block
|  | ld1	{v20.16b}, [x1], #16 | 
|  | eor	v20.16b, v20.16b, v0.16b | 
|  | mov	x11, v20.d[0] | 
|  | mov	x12, v20.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | st1	{v20.16b}, [x0], #16 | 
|  |  | 
|  | sub	x2, x2, #16 | 
|  |  | 
|  | // Shift the state left by 16 bytes for the next iteration of the loop | 
|  | mov	v0.16b, v5.16b | 
|  | mov	v5.16b, v10.16b | 
|  | mov	v10.16b, v15.16b | 
|  |  | 
|  | b	Lseal_tail_64 | 
|  |  | 
|  | Lseal_tail_16: | 
|  | // Here we handle the last [0,16) bytes of ciphertext that require a padded block | 
|  | cbz	x2, Lseal_hash_extra | 
|  |  | 
|  | eor	v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in | 
|  | eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes | 
|  | not	v22.16b, v20.16b | 
|  |  | 
|  | mov	x6, x2 | 
|  | add	x1, x1, x2 | 
|  |  | 
|  | cbz	x4, Lseal_tail_16_compose // No extra data to pad with, zero padding | 
|  |  | 
|  | mov	x7, #16          // We need to load some extra_in first for padding | 
|  | sub	x7, x7, x2 | 
|  | cmp	x4, x7 | 
|  | csel	x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register | 
|  | mov	x12, x7 | 
|  | add	x3, x3, x7 | 
|  | sub	x4, x4, x7 | 
|  |  | 
|  | Lseal_tail16_compose_extra_in: | 
|  | ext	v20.16b, v20.16b, v20.16b, #15 | 
|  | ldrb	w11, [x3, #-1]! | 
|  | mov	v20.b[0], w11 | 
|  | subs	x7, x7, #1 | 
|  | b.gt	Lseal_tail16_compose_extra_in | 
|  |  | 
|  | add	x3, x3, x12 | 
|  |  | 
|  | Lseal_tail_16_compose: | 
|  | ext	v20.16b, v20.16b, v20.16b, #15 | 
|  | ldrb	w11, [x1, #-1]! | 
|  | mov	v20.b[0], w11 | 
|  | ext	v21.16b, v22.16b, v21.16b, #15 | 
|  | subs	x2, x2, #1 | 
|  | b.gt	Lseal_tail_16_compose | 
|  |  | 
|  | and	v0.16b, v0.16b, v21.16b | 
|  | eor	v20.16b, v20.16b, v0.16b | 
|  | mov	v21.16b, v20.16b | 
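// v20 now holds the ciphertext bytes in its low lanes with the extra_in bytes
// untouched above them; keep a copy in v21 so the combined block can be
// hashed after the ciphertext bytes are stored.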
|  |  | 
|  | Lseal_tail_16_store: | 
|  | umov	w11, v20.b[0] | 
|  | strb	w11, [x0], #1 | 
|  | ext	v20.16b, v20.16b, v20.16b, #1 | 
|  | subs	x6, x6, #1 | 
|  | b.gt	Lseal_tail_16_store | 
|  |  | 
|  | // Hash in the final ct block concatenated with extra_in | 
|  | mov	x11, v21.d[0] | 
|  | mov	x12, v21.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  |  | 
|  | Lseal_hash_extra: | 
|  | cbz	x4, Lseal_finalize | 
|  |  | 
|  | Lseal_hash_extra_loop: | 
|  | cmp	x4, #16 | 
|  | b.lt	Lseal_hash_extra_tail | 
|  | ld1	{v20.16b}, [x3], #16 | 
|  | mov	x11, v20.d[0] | 
|  | mov	x12, v20.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | sub	x4, x4, #16 | 
|  | b	Lseal_hash_extra_loop | 
|  |  | 
|  | Lseal_hash_extra_tail: | 
|  | cbz	x4, Lseal_finalize | 
|  | eor	v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext | 
|  | add	x3, x3, x4 | 
|  |  | 
|  | Lseal_hash_extra_load: | 
|  | ext	v20.16b, v20.16b, v20.16b, #15 | 
|  | ldrb	w11, [x3, #-1]! | 
|  | mov	v20.b[0], w11 | 
|  | subs	x4, x4, #1 | 
|  | b.gt	Lseal_hash_extra_load | 
|  |  | 
// Hash in the final padded extra_in block
|  | mov	x11, v20.d[0] | 
|  | mov	x12, v20.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  |  | 
|  | Lseal_finalize: | 
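// Absorb the final length block (AAD length || total ciphertext length) that
// was saved in v31.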
|  | mov	x11, v31.d[0] | 
|  | mov	x12, v31.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | // Final reduction step | 
|  | sub	x12, xzr, x15 | 
|  | orr	x13, xzr, #3 | 
|  | subs	x11, x8, #-5 | 
|  | sbcs	x12, x9, x12 | 
|  | sbcs	x13, x10, x13 | 
|  | csel	x8, x11, x8, cs | 
|  | csel	x9, x12, x9, cs | 
|  | csel	x10, x13, x10, cs | 
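// Carry set means acc >= 2^130 - 5, so the reduced value is selected; the s
// key (kept in v27) is then added to form the tag.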
|  | mov	x11, v27.d[0] | 
|  | mov	x12, v27.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  |  | 
|  | stp	x8, x9, [x5] | 
|  |  | 
|  | ldp	d8, d9, [sp, #16] | 
|  | ldp	d10, d11, [sp, #32] | 
|  | ldp	d12, d13, [sp, #48] | 
|  | ldp	d14, d15, [sp, #64] | 
|  | .cfi_restore	b15 | 
|  | .cfi_restore	b14 | 
|  | .cfi_restore	b13 | 
|  | .cfi_restore	b12 | 
|  | .cfi_restore	b11 | 
|  | .cfi_restore	b10 | 
|  | .cfi_restore	b9 | 
|  | .cfi_restore	b8 | 
|  | ldp	x29, x30, [sp], 80 | 
|  | .cfi_restore	w29 | 
|  | .cfi_restore	w30 | 
|  | .cfi_def_cfa_offset	0 | 
|  | AARCH64_VALIDATE_LINK_REGISTER | 
|  | ret | 
|  |  | 
|  | Lseal_128: | 
|  | // On some architectures preparing 5 blocks for small buffers is wasteful | 
|  | eor	v25.16b, v25.16b, v25.16b | 
|  | mov	x11, #1 | 
|  | mov	v25.s[0], w11 | 
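// Prepare three blocks: v0/v1 (counters 1 and 2) provide keystream for up to
// 128 bytes, while v2 (counter 0) supplies the Poly1305 r and s keys.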
|  | mov	v0.16b, v24.16b | 
|  | mov	v1.16b, v24.16b | 
|  | mov	v2.16b, v24.16b | 
|  | mov	v5.16b, v28.16b | 
|  | mov	v6.16b, v28.16b | 
|  | mov	v7.16b, v28.16b | 
|  | mov	v10.16b, v29.16b | 
|  | mov	v11.16b, v29.16b | 
|  | mov	v12.16b, v29.16b | 
|  | mov	v17.16b, v30.16b | 
|  | add	v15.4s, v17.4s, v25.4s | 
|  | add	v16.4s, v15.4s, v25.4s | 
|  |  | 
|  | mov	x6, #10 | 
|  |  | 
|  | Lseal_128_rounds: | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | eor	v6.16b, v6.16b, v11.16b | 
|  | eor	v7.16b, v7.16b, v12.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v6.4s, #20 | 
|  | sli	v5.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v5.4s | 
|  | add	v2.4s, v2.4s, v6.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v5.4s, #25 | 
|  | sli	v6.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v5.16b, v5.16b, v5.16b, #4 | 
|  | ext	v6.16b, v6.16b, v6.16b, #4 | 
|  | ext	v7.16b, v7.16b, v7.16b, #4 | 
|  |  | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v11.16b, v11.16b, v11.16b, #8 | 
|  | ext	v12.16b, v12.16b, v12.16b, #8 | 
|  |  | 
|  | ext	v15.16b, v15.16b, v15.16b, #12 | 
|  | ext	v16.16b, v16.16b, v16.16b, #12 | 
|  | ext	v17.16b, v17.16b, v17.16b, #12 | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | eor	v6.16b, v6.16b, v11.16b | 
|  | eor	v7.16b, v7.16b, v12.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v6.4s, #20 | 
|  | sli	v5.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v5.4s | 
|  | add	v2.4s, v2.4s, v6.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v5.4s, #25 | 
|  | sli	v6.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v5.16b, v5.16b, v5.16b, #12 | 
|  | ext	v6.16b, v6.16b, v6.16b, #12 | 
|  | ext	v7.16b, v7.16b, v7.16b, #12 | 
|  |  | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v11.16b, v11.16b, v11.16b, #8 | 
|  | ext	v12.16b, v12.16b, v12.16b, #8 | 
|  |  | 
|  | ext	v15.16b, v15.16b, v15.16b, #4 | 
|  | ext	v16.16b, v16.16b, v16.16b, #4 | 
|  | ext	v17.16b, v17.16b, v17.16b, #4 | 
|  | subs	x6, x6, #1 | 
|  | b.hi	Lseal_128_rounds | 
|  |  | 
|  | add	v0.4s, v0.4s, v24.4s | 
|  | add	v1.4s, v1.4s, v24.4s | 
|  | add	v2.4s, v2.4s, v24.4s | 
|  |  | 
|  | add	v5.4s, v5.4s, v28.4s | 
|  | add	v6.4s, v6.4s, v28.4s | 
|  | add	v7.4s, v7.4s, v28.4s | 
|  |  | 
|  | // Only the first 32 bytes of the third block (counter = 0) are needed, | 
|  | // so skip updating v12 and v17. | 
|  | add	v10.4s, v10.4s, v29.4s | 
|  | add	v11.4s, v11.4s, v29.4s | 
|  |  | 
|  | add	v30.4s, v30.4s, v25.4s | 
|  | add	v15.4s, v15.4s, v30.4s | 
|  | add	v30.4s, v30.4s, v25.4s | 
|  | add	v16.4s, v16.4s, v30.4s | 
|  |  | 
|  | and	v2.16b, v2.16b, v27.16b | 
|  | mov	x16, v2.d[0] // Move the R key to GPRs | 
|  | mov	x17, v2.d[1] | 
|  | mov	v27.16b, v7.16b // Store the S key | 
|  |  | 
|  | bl	Lpoly_hash_ad_internal | 
|  | b	Lseal_tail | 
|  | .cfi_endproc | 
|  |  | 
|  |  | 
|  | ///////////////////////////////// | 
|  | // | 
|  | // void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); | 
|  | // | 
|  | .globl	chacha20_poly1305_open | 
|  |  | 
|  | .def chacha20_poly1305_open | 
|  | .type 32 | 
|  | .endef | 
|  | .align	6 | 
|  | chacha20_poly1305_open: | 
|  | AARCH64_SIGN_LINK_REGISTER | 
|  | .cfi_startproc | 
|  | stp	x29, x30, [sp, #-80]! | 
|  | .cfi_def_cfa_offset	80 | 
|  | .cfi_offset	w30, -72 | 
|  | .cfi_offset	w29, -80 | 
|  | mov	x29, sp | 
|  | // We probably could do .cfi_def_cfa w29, 80 at this point, but since | 
|  | // we don't actually use the frame pointer like that, it's probably not | 
|  | // worth bothering. | 
|  | stp	d8, d9, [sp, #16] | 
|  | stp	d10, d11, [sp, #32] | 
|  | stp	d12, d13, [sp, #48] | 
|  | stp	d14, d15, [sp, #64] | 
|  | .cfi_offset	b15, -8 | 
|  | .cfi_offset	b14, -16 | 
|  | .cfi_offset	b13, -24 | 
|  | .cfi_offset	b12, -32 | 
|  | .cfi_offset	b11, -40 | 
|  | .cfi_offset	b10, -48 | 
|  | .cfi_offset	b9, -56 | 
|  | .cfi_offset	b8, -64 | 
|  |  | 
|  | adrp	x11, Lchacha20_consts | 
|  | add	x11, x11, :lo12:Lchacha20_consts | 
|  |  | 
|  | ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values | 
|  | ld1	{v28.16b - v30.16b}, [x5] | 
|  |  | 
|  | mov	x15, #1 // Prepare the Poly1305 state | 
|  | mov	x8, #0 | 
|  | mov	x9, #0 | 
|  | mov	x10, #0 | 
|  |  | 
|  | mov	v31.d[0], x4  // Store the input and aad lengths | 
|  | mov	v31.d[1], x2 | 
|  |  | 
|  | cmp	x2, #128 | 
|  | b.le	Lopen_128 // Optimization for smaller buffers | 
|  |  | 
|  | // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys | 
|  | mov	v0.16b, v24.16b | 
|  | mov	v5.16b, v28.16b | 
|  | mov	v10.16b, v29.16b | 
|  | mov	v15.16b, v30.16b | 
|  |  | 
|  | mov	x6, #10 | 
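// Each iteration below is one ChaCha20 double round on the single block used
// to derive the Poly1305 keys.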
|  |  | 
|  | .align	5 | 
|  | Lopen_init_rounds: | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | rev32	v15.8h, v15.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  | ext	v5.16b, v5.16b, v5.16b, #4 | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v15.16b, v15.16b, v15.16b, #12 | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | rev32	v15.8h, v15.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  | ext	v5.16b, v5.16b, v5.16b, #12 | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v15.16b, v15.16b, v15.16b, #4 | 
|  | subs	x6, x6, #1 | 
|  | b.hi	Lopen_init_rounds | 
|  |  | 
|  | add	v0.4s, v0.4s, v24.4s | 
|  | add	v5.4s, v5.4s, v28.4s | 
|  |  | 
|  | and	v0.16b, v0.16b, v27.16b | 
|  | mov	x16, v0.d[0] // Move the R key to GPRs | 
|  | mov	x17, v0.d[1] | 
|  | mov	v27.16b, v5.16b // Store the S key | 
|  |  | 
|  | bl	Lpoly_hash_ad_internal | 
|  |  | 
|  | Lopen_ad_done: | 
|  | mov	x3, x1 | 
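// x3 tracks the ciphertext being hashed; decrypted output is still written
// through x0.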
|  |  | 
// Each iteration of the loop hashes 320 bytes and prepares keystream for 320 bytes
|  | Lopen_main_loop: | 
|  |  | 
|  | cmp	x2, #192 | 
|  | b.lt	Lopen_tail | 
|  |  | 
|  | adrp	x11, Lchacha20_consts | 
|  | add	x11, x11, :lo12:Lchacha20_consts | 
|  |  | 
|  | ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11] | 
|  | mov	v4.16b, v24.16b | 
|  |  | 
|  | ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 | 
|  | mov	v9.16b, v28.16b | 
|  |  | 
|  | ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 | 
|  | mov	v14.16b, v29.16b | 
|  |  | 
|  | ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5] | 
|  | sub	x5, x5, #32 | 
|  | add	v15.4s, v15.4s, v25.4s | 
|  | mov	v19.16b, v30.16b | 
|  |  | 
|  | eor	v20.16b, v20.16b, v20.16b //zero | 
|  | not	v21.16b, v20.16b // -1 | 
|  | sub	v21.4s, v25.4s, v21.4s // Add +1 | 
|  | ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) | 
|  | add	v19.4s, v19.4s, v20.4s | 
|  |  | 
|  | lsr	x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12 | 
|  | sub	x4, x4, #10 | 
|  |  | 
|  | mov	x7, #10 | 
subs	x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash
|  | csel	x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full | 
|  |  | 
|  | cbz	x7, Lopen_main_loop_rounds_short | 
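// x7 counts double rounds that absorb two 16-byte ciphertext blocks (one at
// the top of the loop, one mid-round); x6 counts the remaining rounds that
// absorb only the mid-round block.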
|  |  | 
|  | .align	5 | 
|  | Lopen_main_loop_rounds: | 
|  | ldp	x11, x12, [x3], 16 | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | Lopen_main_loop_rounds_short: | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | add	v3.4s, v3.4s, v8.4s | 
|  | add	v4.4s, v4.4s, v9.4s | 
|  |  | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | eor	v18.16b, v18.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  | rev32	v18.8h, v18.8h | 
|  | rev32	v19.8h, v19.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | add	v13.4s, v13.4s, v18.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | eor	v6.16b, v6.16b, v11.16b | 
|  | eor	v7.16b, v7.16b, v12.16b | 
|  | eor	v8.16b, v8.16b, v13.16b | 
|  | eor	v9.16b, v9.16b, v14.16b | 
|  |  | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v6.4s, #20 | 
|  | sli	v5.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  | ushr	v7.4s, v8.4s, #20 | 
|  | sli	v7.4s, v8.4s, #12 | 
|  | ushr	v8.4s, v9.4s, #20 | 
|  | sli	v8.4s, v9.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v5.4s | 
|  | add	v2.4s, v2.4s, v6.4s | 
|  | add	v3.4s, v3.4s, v7.4s | 
|  | add	v4.4s, v4.4s, v8.4s | 
|  |  | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | eor	v18.16b, v18.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  | tbl	v18.16b, {v18.16b}, v26.16b | 
|  | tbl	v19.16b, {v19.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | add	v13.4s, v13.4s, v18.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | eor	v7.16b, v7.16b, v13.16b | 
|  | eor	v8.16b, v8.16b, v14.16b | 
|  |  | 
|  | ushr	v9.4s, v8.4s, #25 | 
|  | sli	v9.4s, v8.4s, #7 | 
|  | ushr	v8.4s, v7.4s, #25 | 
|  | sli	v8.4s, v7.4s, #7 | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v5.4s, #25 | 
|  | sli	v6.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v9.16b, v9.16b, v9.16b, #4 | 
|  | ext	v14.16b, v14.16b, v14.16b, #8 | 
|  | ext	v19.16b, v19.16b, v19.16b, #12 | 
|  | ldp	x11, x12, [x3], 16 | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | add	v0.4s, v0.4s, v6.4s | 
|  | add	v1.4s, v1.4s, v7.4s | 
|  | add	v2.4s, v2.4s, v8.4s | 
|  | add	v3.4s, v3.4s, v5.4s | 
|  | add	v4.4s, v4.4s, v9.4s | 
|  |  | 
|  | eor	v18.16b, v18.16b, v0.16b | 
|  | eor	v15.16b, v15.16b, v1.16b | 
|  | eor	v16.16b, v16.16b, v2.16b | 
|  | eor	v17.16b, v17.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | rev32	v18.8h, v18.8h | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  | rev32	v19.8h, v19.8h | 
|  |  | 
|  | add	v12.4s, v12.4s, v18.4s | 
|  | add	v13.4s, v13.4s, v15.4s | 
|  | add	v10.4s, v10.4s, v16.4s | 
|  | add	v11.4s, v11.4s, v17.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | eor	v7.16b, v7.16b, v13.16b | 
|  | eor	v8.16b, v8.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v9.16b, v9.16b, v14.16b | 
|  |  | 
|  | ushr	v20.4s, v6.4s, #20 | 
|  | sli	v20.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  | ushr	v7.4s, v8.4s, #20 | 
|  | sli	v7.4s, v8.4s, #12 | 
|  | ushr	v8.4s, v5.4s, #20 | 
|  | sli	v8.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v9.4s, #20 | 
|  | sli	v5.4s, v9.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | add	v3.4s, v3.4s, v8.4s | 
|  | add	v4.4s, v4.4s, v5.4s | 
|  |  | 
|  | eor	v18.16b, v18.16b, v0.16b | 
|  | eor	v15.16b, v15.16b, v1.16b | 
|  | eor	v16.16b, v16.16b, v2.16b | 
|  | eor	v17.16b, v17.16b, v3.16b | 
|  | eor	v19.16b, v19.16b, v4.16b | 
|  |  | 
|  | tbl	v18.16b, {v18.16b}, v26.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  | tbl	v19.16b, {v19.16b}, v26.16b | 
|  |  | 
|  | add	v12.4s, v12.4s, v18.4s | 
|  | add	v13.4s, v13.4s, v15.4s | 
|  | add	v10.4s, v10.4s, v16.4s | 
|  | add	v11.4s, v11.4s, v17.4s | 
|  | add	v14.4s, v14.4s, v19.4s | 
|  |  | 
|  | eor	v20.16b, v20.16b, v12.16b | 
|  | eor	v6.16b, v6.16b, v13.16b | 
|  | eor	v7.16b, v7.16b, v10.16b | 
|  | eor	v8.16b, v8.16b, v11.16b | 
|  | eor	v5.16b, v5.16b, v14.16b | 
|  |  | 
|  | ushr	v9.4s, v5.4s, #25 | 
|  | sli	v9.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v8.4s, #25 | 
|  | sli	v5.4s, v8.4s, #7 | 
|  | ushr	v8.4s, v7.4s, #25 | 
|  | sli	v8.4s, v7.4s, #7 | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v20.4s, #25 | 
|  | sli	v6.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v9.16b, v9.16b, v9.16b, #12 | 
|  | ext	v14.16b, v14.16b, v14.16b, #8 | 
|  | ext	v19.16b, v19.16b, v19.16b, #4 | 
|  | subs	x7, x7, #1 | 
|  | b.gt	Lopen_main_loop_rounds | 
|  | subs	x6, x6, #1 | 
|  | b.ge	Lopen_main_loop_rounds_short | 
|  |  | 
eor	v20.16b, v20.16b, v20.16b // zero
not	v21.16b, v20.16b // all lanes = -1
sub	v21.4s, v25.4s, v21.4s // subtracting -1 adds +1 to every lane of v25
ext	v20.16b, v21.16b, v20.16b, #12 // keep only the last lane (the counter) in lane 0
|  | add	v19.4s, v19.4s, v20.4s | 
|  |  | 
|  | add	v15.4s, v15.4s, v25.4s | 
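// Advance the block counters in v25 by 5 (five blocks are consumed per main-loop pass).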
|  | mov	x11, #5 | 
|  | dup	v20.4s, w11 | 
|  | add	v25.4s, v25.4s, v20.4s | 
|  |  | 
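// Transpose the word-sliced states of blocks 0-3 back into one row per register
// before adding the initial state rows (v24, v28, v29, v30).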
|  | zip1	v20.4s, v0.4s, v1.4s | 
|  | zip2	v21.4s, v0.4s, v1.4s | 
|  | zip1	v22.4s, v2.4s, v3.4s | 
|  | zip2	v23.4s, v2.4s, v3.4s | 
|  |  | 
|  | zip1	v0.2d, v20.2d, v22.2d | 
|  | zip2	v1.2d, v20.2d, v22.2d | 
|  | zip1	v2.2d, v21.2d, v23.2d | 
|  | zip2	v3.2d, v21.2d, v23.2d | 
|  |  | 
|  | zip1	v20.4s, v5.4s, v6.4s | 
|  | zip2	v21.4s, v5.4s, v6.4s | 
|  | zip1	v22.4s, v7.4s, v8.4s | 
|  | zip2	v23.4s, v7.4s, v8.4s | 
|  |  | 
|  | zip1	v5.2d, v20.2d, v22.2d | 
|  | zip2	v6.2d, v20.2d, v22.2d | 
|  | zip1	v7.2d, v21.2d, v23.2d | 
|  | zip2	v8.2d, v21.2d, v23.2d | 
|  |  | 
|  | zip1	v20.4s, v10.4s, v11.4s | 
|  | zip2	v21.4s, v10.4s, v11.4s | 
|  | zip1	v22.4s, v12.4s, v13.4s | 
|  | zip2	v23.4s, v12.4s, v13.4s | 
|  |  | 
|  | zip1	v10.2d, v20.2d, v22.2d | 
|  | zip2	v11.2d, v20.2d, v22.2d | 
|  | zip1	v12.2d, v21.2d, v23.2d | 
|  | zip2	v13.2d, v21.2d, v23.2d | 
|  |  | 
|  | zip1	v20.4s, v15.4s, v16.4s | 
|  | zip2	v21.4s, v15.4s, v16.4s | 
|  | zip1	v22.4s, v17.4s, v18.4s | 
|  | zip2	v23.4s, v17.4s, v18.4s | 
|  |  | 
|  | zip1	v15.2d, v20.2d, v22.2d | 
|  | zip2	v16.2d, v20.2d, v22.2d | 
|  | zip1	v17.2d, v21.2d, v23.2d | 
|  | zip2	v18.2d, v21.2d, v23.2d | 
|  |  | 
|  | add	v0.4s, v0.4s, v24.4s | 
|  | add	v5.4s, v5.4s, v28.4s | 
|  | add	v10.4s, v10.4s, v29.4s | 
|  | add	v15.4s, v15.4s, v30.4s | 
|  |  | 
|  | add	v1.4s, v1.4s, v24.4s | 
|  | add	v6.4s, v6.4s, v28.4s | 
|  | add	v11.4s, v11.4s, v29.4s | 
|  | add	v16.4s, v16.4s, v30.4s | 
|  |  | 
|  | add	v2.4s, v2.4s, v24.4s | 
|  | add	v7.4s, v7.4s, v28.4s | 
|  | add	v12.4s, v12.4s, v29.4s | 
|  | add	v17.4s, v17.4s, v30.4s | 
|  |  | 
|  | add	v3.4s, v3.4s, v24.4s | 
|  | add	v8.4s, v8.4s, v28.4s | 
|  | add	v13.4s, v13.4s, v29.4s | 
|  | add	v18.4s, v18.4s, v30.4s | 
|  |  | 
|  | add	v4.4s, v4.4s, v24.4s | 
|  | add	v9.4s, v9.4s, v28.4s | 
|  | add	v14.4s, v14.4s, v29.4s | 
|  | add	v19.4s, v19.4s, v30.4s | 
|  |  | 
|  | // We can always safely store 192 bytes | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v0.16b | 
|  | eor	v21.16b, v21.16b, v5.16b | 
|  | eor	v22.16b, v22.16b, v10.16b | 
|  | eor	v23.16b, v23.16b, v15.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v1.16b | 
|  | eor	v21.16b, v21.16b, v6.16b | 
|  | eor	v22.16b, v22.16b, v11.16b | 
|  | eor	v23.16b, v23.16b, v16.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v2.16b | 
|  | eor	v21.16b, v21.16b, v7.16b | 
|  | eor	v22.16b, v22.16b, v12.16b | 
|  | eor	v23.16b, v23.16b, v17.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | sub	x2, x2, #192 | 
|  |  | 
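// Blocks 3 and 4 are still unused; stage block 3 in v0/v5/v10/v15 in case fewer than 64 bytes remain.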
|  | mov	v0.16b, v3.16b | 
|  | mov	v5.16b, v8.16b | 
|  | mov	v10.16b, v13.16b | 
|  | mov	v15.16b, v18.16b | 
|  |  | 
|  | cmp	x2, #64 | 
|  | b.lt	Lopen_tail_64_store | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v3.16b | 
|  | eor	v21.16b, v21.16b, v8.16b | 
|  | eor	v22.16b, v22.16b, v13.16b | 
|  | eor	v23.16b, v23.16b, v18.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | sub	x2, x2, #64 | 
|  |  | 
|  | mov	v0.16b, v4.16b | 
|  | mov	v5.16b, v9.16b | 
|  | mov	v10.16b, v14.16b | 
|  | mov	v15.16b, v19.16b | 
|  |  | 
|  | cmp	x2, #64 | 
|  | b.lt	Lopen_tail_64_store | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  | eor	v20.16b, v20.16b, v4.16b | 
|  | eor	v21.16b, v21.16b, v9.16b | 
|  | eor	v22.16b, v22.16b, v14.16b | 
|  | eor	v23.16b, v23.16b, v19.16b | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | sub	x2, x2, #64 | 
|  | b	Lopen_main_loop | 
|  |  | 
|  | Lopen_tail: | 
|  |  | 
|  | cbz	x2, Lopen_finalize | 
|  |  | 
|  | lsr	x4, x2, #4 // How many whole blocks we have to hash | 
|  |  | 
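// At most 192 bytes remain; generate one, two or three more ChaCha20 blocks as needed.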
|  | cmp	x2, #64 | 
|  | b.le	Lopen_tail_64 | 
|  | cmp	x2, #128 | 
|  | b.le	Lopen_tail_128 | 
|  |  | 
|  | Lopen_tail_192: | 
|  | // We need three more blocks | 
|  | mov	v0.16b, v24.16b | 
|  | mov	v1.16b, v24.16b | 
|  | mov	v2.16b, v24.16b | 
|  | mov	v5.16b, v28.16b | 
|  | mov	v6.16b, v28.16b | 
|  | mov	v7.16b, v28.16b | 
|  | mov	v10.16b, v29.16b | 
|  | mov	v11.16b, v29.16b | 
|  | mov	v12.16b, v29.16b | 
|  | mov	v15.16b, v30.16b | 
|  | mov	v16.16b, v30.16b | 
|  | mov	v17.16b, v30.16b | 
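// Counter offsets relative to v25.s[0]: v16 gets +0, v17 gets +1, v15 gets +2
// (the blocks are consumed in that order below).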
|  | eor	v23.16b, v23.16b, v23.16b | 
|  | eor	v21.16b, v21.16b, v21.16b | 
|  | ins	v23.s[0], v25.s[0] | 
|  | ins	v21.d[0], x15 | 
|  |  | 
|  | add	v22.4s, v23.4s, v21.4s | 
|  | add	v21.4s, v22.4s, v21.4s | 
|  |  | 
|  | add	v15.4s, v15.4s, v21.4s | 
|  | add	v16.4s, v16.4s, v23.4s | 
|  | add	v17.4s, v17.4s, v22.4s | 
|  |  | 
|  | mov	x7, #10 | 
subs	x6, x7, x4 // x6 = 10 - (blocks to hash); negative if more than 160 bytes still need hashing
csel	x7, x7, x4, le // if x6 <= 0 keep x7 at 10 so every round hashes, otherwise hash only on the first x4 rounds
|  | sub	x4, x4, x7 | 
|  |  | 
|  | cbz	x7, Lopen_tail_192_rounds_no_hash | 
|  |  | 
|  | Lopen_tail_192_rounds: | 
|  | ldp	x11, x12, [x3], 16 | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | Lopen_tail_192_rounds_no_hash: | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | eor	v6.16b, v6.16b, v11.16b | 
|  | eor	v7.16b, v7.16b, v12.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v6.4s, #20 | 
|  | sli	v5.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v5.4s | 
|  | add	v2.4s, v2.4s, v6.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v5.4s, #25 | 
|  | sli	v6.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v5.16b, v5.16b, v5.16b, #4 | 
|  | ext	v6.16b, v6.16b, v6.16b, #4 | 
|  | ext	v7.16b, v7.16b, v7.16b, #4 | 
|  |  | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v11.16b, v11.16b, v11.16b, #8 | 
|  | ext	v12.16b, v12.16b, v12.16b, #8 | 
|  |  | 
|  | ext	v15.16b, v15.16b, v15.16b, #12 | 
|  | ext	v16.16b, v16.16b, v16.16b, #12 | 
|  | ext	v17.16b, v17.16b, v17.16b, #12 | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | eor	v6.16b, v6.16b, v11.16b | 
|  | eor	v7.16b, v7.16b, v12.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v6.4s, #20 | 
|  | sli	v5.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v5.4s | 
|  | add	v2.4s, v2.4s, v6.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v5.4s, #25 | 
|  | sli	v6.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v5.16b, v5.16b, v5.16b, #12 | 
|  | ext	v6.16b, v6.16b, v6.16b, #12 | 
|  | ext	v7.16b, v7.16b, v7.16b, #12 | 
|  |  | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v11.16b, v11.16b, v11.16b, #8 | 
|  | ext	v12.16b, v12.16b, v12.16b, #8 | 
|  |  | 
|  | ext	v15.16b, v15.16b, v15.16b, #4 | 
|  | ext	v16.16b, v16.16b, v16.16b, #4 | 
|  | ext	v17.16b, v17.16b, v17.16b, #4 | 
|  | subs	x7, x7, #1 | 
|  | b.gt	Lopen_tail_192_rounds | 
|  | subs	x6, x6, #1 | 
|  | b.ge	Lopen_tail_192_rounds_no_hash | 
|  |  | 
// We hashed at most 160 bytes so far; up to 32 bytes may still be left to hash
|  | Lopen_tail_192_hash: | 
|  | cbz	x4, Lopen_tail_192_hash_done | 
|  | ldp	x11, x12, [x3], 16 | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | sub	x4, x4, #1 | 
|  | b	Lopen_tail_192_hash | 
|  |  | 
|  | Lopen_tail_192_hash_done: | 
|  |  | 
|  | add	v0.4s, v0.4s, v24.4s | 
|  | add	v1.4s, v1.4s, v24.4s | 
|  | add	v2.4s, v2.4s, v24.4s | 
|  | add	v5.4s, v5.4s, v28.4s | 
|  | add	v6.4s, v6.4s, v28.4s | 
|  | add	v7.4s, v7.4s, v28.4s | 
|  | add	v10.4s, v10.4s, v29.4s | 
|  | add	v11.4s, v11.4s, v29.4s | 
|  | add	v12.4s, v12.4s, v29.4s | 
|  | add	v15.4s, v15.4s, v30.4s | 
|  | add	v16.4s, v16.4s, v30.4s | 
|  | add	v17.4s, v17.4s, v30.4s | 
|  |  | 
|  | add	v15.4s, v15.4s, v21.4s | 
|  | add	v16.4s, v16.4s, v23.4s | 
|  | add	v17.4s, v17.4s, v22.4s | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  |  | 
|  | eor	v20.16b, v20.16b, v1.16b | 
|  | eor	v21.16b, v21.16b, v6.16b | 
|  | eor	v22.16b, v22.16b, v11.16b | 
|  | eor	v23.16b, v23.16b, v16.16b | 
|  |  | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  |  | 
|  | eor	v20.16b, v20.16b, v2.16b | 
|  | eor	v21.16b, v21.16b, v7.16b | 
|  | eor	v22.16b, v22.16b, v12.16b | 
|  | eor	v23.16b, v23.16b, v17.16b | 
|  |  | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | sub	x2, x2, #128 | 
|  | b	Lopen_tail_64_store | 
|  |  | 
|  | Lopen_tail_128: | 
|  | // We need two more blocks | 
|  | mov	v0.16b, v24.16b | 
|  | mov	v1.16b, v24.16b | 
|  | mov	v5.16b, v28.16b | 
|  | mov	v6.16b, v28.16b | 
|  | mov	v10.16b, v29.16b | 
|  | mov	v11.16b, v29.16b | 
|  | mov	v15.16b, v30.16b | 
|  | mov	v16.16b, v30.16b | 
|  | eor	v23.16b, v23.16b, v23.16b | 
|  | eor	v22.16b, v22.16b, v22.16b | 
|  | ins	v23.s[0], v25.s[0] | 
|  | ins	v22.d[0], x15 | 
|  | add	v22.4s, v22.4s, v23.4s | 
|  |  | 
|  | add	v15.4s, v15.4s, v22.4s | 
|  | add	v16.4s, v16.4s, v23.4s | 
|  |  | 
|  | mov	x6, #10 | 
|  | sub	x6, x6, x4 | 
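// Run 10 double rounds in total; once the first 10 - x4 plain rounds are done,
// one Poly1305 block is hashed per remaining round.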
|  |  | 
|  | Lopen_tail_128_rounds: | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | rev32	v15.8h, v15.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  | ext	v5.16b, v5.16b, v5.16b, #4 | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v15.16b, v15.16b, v15.16b, #12 | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | rev32	v16.8h, v16.8h | 
|  |  | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | eor	v6.16b, v6.16b, v11.16b | 
|  | ushr	v20.4s, v6.4s, #20 | 
|  | sli	v20.4s, v6.4s, #12 | 
|  | add	v1.4s, v1.4s, v20.4s | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  |  | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | eor	v20.16b, v20.16b, v11.16b | 
|  | ushr	v6.4s, v20.4s, #25 | 
|  | sli	v6.4s, v20.4s, #7 | 
|  | ext	v6.16b, v6.16b, v6.16b, #4 | 
|  | ext	v11.16b, v11.16b, v11.16b, #8 | 
|  | ext	v16.16b, v16.16b, v16.16b, #12 | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | rev32	v15.8h, v15.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  | ext	v5.16b, v5.16b, v5.16b, #12 | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v15.16b, v15.16b, v15.16b, #4 | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | rev32	v16.8h, v16.8h | 
|  |  | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | eor	v6.16b, v6.16b, v11.16b | 
|  | ushr	v20.4s, v6.4s, #20 | 
|  | sli	v20.4s, v6.4s, #12 | 
|  | add	v1.4s, v1.4s, v20.4s | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  |  | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | eor	v20.16b, v20.16b, v11.16b | 
|  | ushr	v6.4s, v20.4s, #25 | 
|  | sli	v6.4s, v20.4s, #7 | 
|  | ext	v6.16b, v6.16b, v6.16b, #12 | 
|  | ext	v11.16b, v11.16b, v11.16b, #8 | 
|  | ext	v16.16b, v16.16b, v16.16b, #4 | 
|  | subs	x6, x6, #1 | 
|  | b.gt	Lopen_tail_128_rounds | 
|  | cbz	x4, Lopen_tail_128_rounds_done | 
|  | subs	x4, x4, #1 | 
|  | ldp	x11, x12, [x3], 16 | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | b	Lopen_tail_128_rounds | 
|  |  | 
|  | Lopen_tail_128_rounds_done: | 
|  | add	v0.4s, v0.4s, v24.4s | 
|  | add	v1.4s, v1.4s, v24.4s | 
|  | add	v5.4s, v5.4s, v28.4s | 
|  | add	v6.4s, v6.4s, v28.4s | 
|  | add	v10.4s, v10.4s, v29.4s | 
|  | add	v11.4s, v11.4s, v29.4s | 
|  | add	v15.4s, v15.4s, v30.4s | 
|  | add	v16.4s, v16.4s, v30.4s | 
|  | add	v15.4s, v15.4s, v22.4s | 
|  | add	v16.4s, v16.4s, v23.4s | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  |  | 
|  | eor	v20.16b, v20.16b, v1.16b | 
|  | eor	v21.16b, v21.16b, v6.16b | 
|  | eor	v22.16b, v22.16b, v11.16b | 
|  | eor	v23.16b, v23.16b, v16.16b | 
|  |  | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  | sub	x2, x2, #64 | 
|  |  | 
|  | b	Lopen_tail_64_store | 
|  |  | 
|  | Lopen_tail_64: | 
|  | // We just need a single block | 
|  | mov	v0.16b, v24.16b | 
|  | mov	v5.16b, v28.16b | 
|  | mov	v10.16b, v29.16b | 
|  | mov	v15.16b, v30.16b | 
|  | eor	v23.16b, v23.16b, v23.16b | 
|  | ins	v23.s[0], v25.s[0] | 
|  | add	v15.4s, v15.4s, v23.4s | 
|  |  | 
|  | mov	x6, #10 | 
|  | sub	x6, x6, x4 | 
|  |  | 
|  | Lopen_tail_64_rounds: | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | rev32	v15.8h, v15.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  | ext	v5.16b, v5.16b, v5.16b, #4 | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v15.16b, v15.16b, v15.16b, #12 | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | rev32	v15.8h, v15.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  | ext	v5.16b, v5.16b, v5.16b, #12 | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v15.16b, v15.16b, v15.16b, #4 | 
|  | subs	x6, x6, #1 | 
|  | b.gt	Lopen_tail_64_rounds | 
|  | cbz	x4, Lopen_tail_64_rounds_done | 
|  | subs	x4, x4, #1 | 
|  | ldp	x11, x12, [x3], 16 | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | b	Lopen_tail_64_rounds | 
|  |  | 
|  | Lopen_tail_64_rounds_done: | 
|  | add	v0.4s, v0.4s, v24.4s | 
|  | add	v5.4s, v5.4s, v28.4s | 
|  | add	v10.4s, v10.4s, v29.4s | 
|  | add	v15.4s, v15.4s, v30.4s | 
|  | add	v15.4s, v15.4s, v23.4s | 
|  |  | 
|  | Lopen_tail_64_store: | 
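// XOR and store the remaining data 16 bytes at a time, rotating the staged keystream rows down.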
|  | cmp	x2, #16 | 
|  | b.lt	Lopen_tail_16 | 
|  |  | 
|  | ld1	{v20.16b}, [x1], #16 | 
|  | eor	v20.16b, v20.16b, v0.16b | 
|  | st1	{v20.16b}, [x0], #16 | 
|  | mov	v0.16b, v5.16b | 
|  | mov	v5.16b, v10.16b | 
|  | mov	v10.16b, v15.16b | 
|  | sub	x2, x2, #16 | 
|  | b	Lopen_tail_64_store | 
|  |  | 
|  | Lopen_tail_16: | 
|  | // Here we handle the last [0,16) bytes that require a padded block | 
|  | cbz	x2, Lopen_finalize | 
|  |  | 
|  | eor	v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext | 
|  | eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask | 
|  | not	v22.16b, v20.16b | 
|  |  | 
|  | add	x7, x1, x2 | 
|  | mov	x6, x2 | 
|  |  | 
|  | Lopen_tail_16_compose: | 
|  | ext	v20.16b, v20.16b, v20.16b, #15 | 
|  | ldrb	w11, [x7, #-1]! | 
|  | mov	v20.b[0], w11 | 
|  | ext	v21.16b, v22.16b, v21.16b, #15 | 
|  | subs	x2, x2, #1 | 
|  | b.gt	Lopen_tail_16_compose | 
|  |  | 
|  | and	v20.16b, v20.16b, v21.16b | 
|  | // Hash in the final padded block | 
|  | mov	x11, v20.d[0] | 
|  | mov	x12, v20.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | eor	v20.16b, v20.16b, v0.16b | 
|  |  | 
|  | Lopen_tail_16_store: | 
|  | umov	w11, v20.b[0] | 
|  | strb	w11, [x0], #1 | 
|  | ext	v20.16b, v20.16b, v20.16b, #1 | 
|  | subs	x6, x6, #1 | 
|  | b.gt	Lopen_tail_16_store | 
|  |  | 
|  | Lopen_finalize: | 
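// Hash the final 16-byte length block held in v31, then do the last reduction.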
|  | mov	x11, v31.d[0] | 
|  | mov	x12, v31.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | // Final reduction step | 
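// Conditionally subtract p = 2^130 - 5: keep the subtracted value only if the accumulator was >= p.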
|  | sub	x12, xzr, x15 | 
|  | orr	x13, xzr, #3 | 
|  | subs	x11, x8, #-5 | 
|  | sbcs	x12, x9, x12 | 
|  | sbcs	x13, x10, x13 | 
|  | csel	x8, x11, x8, cs | 
|  | csel	x9, x12, x9, cs | 
|  | csel	x10, x13, x10, cs | 
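// Add the second half of the key (s, kept in v27) and write the 16-byte tag to x5.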
|  | mov	x11, v27.d[0] | 
|  | mov	x12, v27.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  |  | 
|  | stp	x8, x9, [x5] | 
|  |  | 
|  | ldp	d8, d9, [sp, #16] | 
|  | ldp	d10, d11, [sp, #32] | 
|  | ldp	d12, d13, [sp, #48] | 
|  | ldp	d14, d15, [sp, #64] | 
|  | .cfi_restore	b15 | 
|  | .cfi_restore	b14 | 
|  | .cfi_restore	b13 | 
|  | .cfi_restore	b12 | 
|  | .cfi_restore	b11 | 
|  | .cfi_restore	b10 | 
|  | .cfi_restore	b9 | 
|  | .cfi_restore	b8 | 
|  | ldp	x29, x30, [sp], 80 | 
|  | .cfi_restore	w29 | 
|  | .cfi_restore	w30 | 
|  | .cfi_def_cfa_offset	0 | 
|  | AARCH64_VALIDATE_LINK_REGISTER | 
|  | ret | 
|  |  | 
|  | Lopen_128: | 
|  | // On some architectures preparing 5 blocks for small buffers is wasteful | 
|  | eor	v25.16b, v25.16b, v25.16b | 
|  | mov	x11, #1 | 
|  | mov	v25.s[0], w11 | 
|  | mov	v0.16b, v24.16b | 
|  | mov	v1.16b, v24.16b | 
|  | mov	v2.16b, v24.16b | 
|  | mov	v5.16b, v28.16b | 
|  | mov	v6.16b, v28.16b | 
|  | mov	v7.16b, v28.16b | 
|  | mov	v10.16b, v29.16b | 
|  | mov	v11.16b, v29.16b | 
|  | mov	v12.16b, v29.16b | 
|  | mov	v17.16b, v30.16b | 
|  | add	v15.4s, v17.4s, v25.4s | 
|  | add	v16.4s, v15.4s, v25.4s | 
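// Block 2 (counter 0) supplies the Poly1305 key; blocks 0 and 1 (counters 1 and 2)
// provide the keystream for the data.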
|  |  | 
|  | mov	x6, #10 | 
|  |  | 
|  | Lopen_128_rounds: | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | eor	v6.16b, v6.16b, v11.16b | 
|  | eor	v7.16b, v7.16b, v12.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v6.4s, #20 | 
|  | sli	v5.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v5.4s | 
|  | add	v2.4s, v2.4s, v6.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v5.4s, #25 | 
|  | sli	v6.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v5.16b, v5.16b, v5.16b, #4 | 
|  | ext	v6.16b, v6.16b, v6.16b, #4 | 
|  | ext	v7.16b, v7.16b, v7.16b, #4 | 
|  |  | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v11.16b, v11.16b, v11.16b, #8 | 
|  | ext	v12.16b, v12.16b, v12.16b, #8 | 
|  |  | 
|  | ext	v15.16b, v15.16b, v15.16b, #12 | 
|  | ext	v16.16b, v16.16b, v16.16b, #12 | 
|  | ext	v17.16b, v17.16b, v17.16b, #12 | 
|  | add	v0.4s, v0.4s, v5.4s | 
|  | add	v1.4s, v1.4s, v6.4s | 
|  | add	v2.4s, v2.4s, v7.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | rev32	v15.8h, v15.8h | 
|  | rev32	v16.8h, v16.8h | 
|  | rev32	v17.8h, v17.8h | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v5.16b, v5.16b, v10.16b | 
|  | eor	v6.16b, v6.16b, v11.16b | 
|  | eor	v7.16b, v7.16b, v12.16b | 
|  | ushr	v20.4s, v5.4s, #20 | 
|  | sli	v20.4s, v5.4s, #12 | 
|  | ushr	v5.4s, v6.4s, #20 | 
|  | sli	v5.4s, v6.4s, #12 | 
|  | ushr	v6.4s, v7.4s, #20 | 
|  | sli	v6.4s, v7.4s, #12 | 
|  |  | 
|  | add	v0.4s, v0.4s, v20.4s | 
|  | add	v1.4s, v1.4s, v5.4s | 
|  | add	v2.4s, v2.4s, v6.4s | 
|  | eor	v15.16b, v15.16b, v0.16b | 
|  | eor	v16.16b, v16.16b, v1.16b | 
|  | eor	v17.16b, v17.16b, v2.16b | 
|  | tbl	v15.16b, {v15.16b}, v26.16b | 
|  | tbl	v16.16b, {v16.16b}, v26.16b | 
|  | tbl	v17.16b, {v17.16b}, v26.16b | 
|  |  | 
|  | add	v10.4s, v10.4s, v15.4s | 
|  | add	v11.4s, v11.4s, v16.4s | 
|  | add	v12.4s, v12.4s, v17.4s | 
|  | eor	v20.16b, v20.16b, v10.16b | 
|  | eor	v5.16b, v5.16b, v11.16b | 
|  | eor	v6.16b, v6.16b, v12.16b | 
|  | ushr	v7.4s, v6.4s, #25 | 
|  | sli	v7.4s, v6.4s, #7 | 
|  | ushr	v6.4s, v5.4s, #25 | 
|  | sli	v6.4s, v5.4s, #7 | 
|  | ushr	v5.4s, v20.4s, #25 | 
|  | sli	v5.4s, v20.4s, #7 | 
|  |  | 
|  | ext	v5.16b, v5.16b, v5.16b, #12 | 
|  | ext	v6.16b, v6.16b, v6.16b, #12 | 
|  | ext	v7.16b, v7.16b, v7.16b, #12 | 
|  |  | 
|  | ext	v10.16b, v10.16b, v10.16b, #8 | 
|  | ext	v11.16b, v11.16b, v11.16b, #8 | 
|  | ext	v12.16b, v12.16b, v12.16b, #8 | 
|  |  | 
|  | ext	v15.16b, v15.16b, v15.16b, #4 | 
|  | ext	v16.16b, v16.16b, v16.16b, #4 | 
|  | ext	v17.16b, v17.16b, v17.16b, #4 | 
|  | subs	x6, x6, #1 | 
|  | b.hi	Lopen_128_rounds | 
|  |  | 
|  | add	v0.4s, v0.4s, v24.4s | 
|  | add	v1.4s, v1.4s, v24.4s | 
|  | add	v2.4s, v2.4s, v24.4s | 
|  |  | 
|  | add	v5.4s, v5.4s, v28.4s | 
|  | add	v6.4s, v6.4s, v28.4s | 
|  | add	v7.4s, v7.4s, v28.4s | 
|  |  | 
|  | add	v10.4s, v10.4s, v29.4s | 
|  | add	v11.4s, v11.4s, v29.4s | 
|  |  | 
|  | add	v30.4s, v30.4s, v25.4s | 
|  | add	v15.4s, v15.4s, v30.4s | 
|  | add	v30.4s, v30.4s, v25.4s | 
|  | add	v16.4s, v16.4s, v30.4s | 
|  |  | 
|  | and	v2.16b, v2.16b, v27.16b | 
|  | mov	x16, v2.d[0] // Move the R key to GPRs | 
|  | mov	x17, v2.d[1] | 
|  | mov	v27.16b, v7.16b // Store the S key | 
|  |  | 
|  | bl	Lpoly_hash_ad_internal | 
|  |  | 
|  | Lopen_128_store: | 
|  | cmp	x2, #64 | 
|  | b.lt	Lopen_128_store_64 | 
|  |  | 
|  | ld1	{v20.16b - v23.16b}, [x1], #64 | 
|  |  | 
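// Hash the 64 ciphertext bytes just loaded (four Poly1305 blocks), then XOR them with the keystream below.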
|  | mov	x11, v20.d[0] | 
|  | mov	x12, v20.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | mov	x11, v21.d[0] | 
|  | mov	x12, v21.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | mov	x11, v22.d[0] | 
|  | mov	x12, v22.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | mov	x11, v23.d[0] | 
|  | mov	x12, v23.d[1] | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  |  | 
|  | eor	v20.16b, v20.16b, v0.16b | 
|  | eor	v21.16b, v21.16b, v5.16b | 
|  | eor	v22.16b, v22.16b, v10.16b | 
|  | eor	v23.16b, v23.16b, v15.16b | 
|  |  | 
|  | st1	{v20.16b - v23.16b}, [x0], #64 | 
|  |  | 
|  | sub	x2, x2, #64 | 
|  |  | 
|  | mov	v0.16b, v1.16b | 
|  | mov	v5.16b, v6.16b | 
|  | mov	v10.16b, v11.16b | 
|  | mov	v15.16b, v16.16b | 
|  |  | 
|  | Lopen_128_store_64: | 
|  |  | 
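// Hash any remaining whole 16-byte ciphertext blocks, then decrypt what is left via Lopen_tail_64_store.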
|  | lsr	x4, x2, #4 | 
|  | mov	x3, x1 | 
|  |  | 
|  | Lopen_128_hash_64: | 
|  | cbz	x4, Lopen_tail_64_store | 
|  | ldp	x11, x12, [x3], 16 | 
|  | adds	x8, x8, x11 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, x15 | 
|  | mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0 | 
|  | umulh	x12, x8, x16 | 
|  | mul	x13, x9, x16 | 
|  | umulh	x14, x9, x16 | 
|  | adds	x12, x12, x13 | 
|  | mul	x13, x10, x16 | 
|  | adc	x13, x13, x14 | 
|  | mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] | 
|  | umulh	x8, x8, x17 | 
|  | adds	x12, x12, x14 | 
|  | mul	x14, x9, x17 | 
|  | umulh	x9, x9, x17 | 
|  | adcs	x14, x14, x8 | 
|  | mul	x10, x10, x17 | 
|  | adc	x10, x10, x9 | 
|  | adds	x13, x13, x14 | 
|  | adc	x14, x10, xzr | 
|  | and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3) | 
|  | and	x8, x13, #-4 | 
|  | extr	x13, x14, x13, #2 | 
|  | adds	x8, x8, x11 | 
|  | lsr	x11, x14, #2 | 
|  | adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits | 
|  | adds	x8, x8, x13 | 
|  | adcs	x9, x9, x12 | 
|  | adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most | 
|  | sub	x4, x4, #1 | 
|  | b	Lopen_128_hash_64 | 
|  | .cfi_endproc | 
|  |  | 
|  | #endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) |