| // This file is generated from a similarly-named Perl script in the BoringSSL |
| // source tree. Do not edit by hand. |
| |
| #include <openssl/asm_base.h> |
| |
| #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) |
| #include <openssl/arm_arch.h> |
| .section .rodata |
| |
| .align 7 |
| Lchacha20_consts: |
| .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' |
| Linc: |
| .long 1,2,3,4 |
| Lrol8: |
| .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 |
| Lclamp: |
| .quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC |
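// Note on the constants above: Lchacha20_consts is the "expand 32-byte k" sigma,
// Linc holds the per-lane counter increments for four vertically-computed blocks,
// Lrol8 is a tbl permutation that rotates each 32-bit lane left by 8 bits, and
// Lclamp is the standard Poly1305 r-clamping mask (RFC 8439).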
| |
| .text |
| |
| .def Lpoly_hash_ad_internal |
| .type 32 |
| .endef |
| .align 6 |
| Lpoly_hash_ad_internal: |
| .cfi_startproc |
| cbnz x4, Lpoly_hash_intro |
| ret |
| |
| Lpoly_hash_intro: |
| cmp x4, #16 |
| b.lt Lpoly_hash_ad_tail |
| ldp x11, x12, [x3], 16 |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
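// The block below computes acc = acc * r mod (2^130 - 5). x16/x17 hold the clamped
// r0/r1 halves; the 256-bit product [t3:t2:t1:t0] is reduced by folding the bits at
// and above 2^130 back in as 5*high = 4*high + high, using 2^130 = 5 (mod p).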
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| sub x4, x4, #16 |
| b Lpoly_hash_ad_internal |
| |
| Lpoly_hash_ad_tail: |
| cbz x4, Lpoly_hash_ad_ret |
| |
| eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD |
| sub x4, x4, #1 |
| |
| Lpoly_hash_tail_16_compose: |
| ext v20.16b, v20.16b, v20.16b, #15 |
| ldrb w11, [x3, x4] |
| mov v20.b[0], w11 |
| subs x4, x4, #1 |
| b.ge Lpoly_hash_tail_16_compose |
| mov x11, v20.d[0] |
| mov x12, v20.d[1] |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| |
| Lpoly_hash_ad_ret: |
| ret |
| .cfi_endproc |
| |
| |
| ///////////////////////////////// |
| // |
| // void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); |
| // |
| .globl chacha20_poly1305_seal |
| |
| .def chacha20_poly1305_seal |
| .type 32 |
| .endef |
| .align 6 |
| chacha20_poly1305_seal: |
| AARCH64_SIGN_LINK_REGISTER |
| .cfi_startproc |
| stp x29, x30, [sp, #-80]! |
| .cfi_def_cfa_offset 80 |
| .cfi_offset w30, -72 |
| .cfi_offset w29, -80 |
| mov x29, sp |
| // We probably could do .cfi_def_cfa w29, 80 at this point, but since |
| // we don't actually use the frame pointer like that, it's probably not |
| // worth bothering. |
| stp d8, d9, [sp, #16] |
| stp d10, d11, [sp, #32] |
| stp d12, d13, [sp, #48] |
| stp d14, d15, [sp, #64] |
| .cfi_offset b15, -8 |
| .cfi_offset b14, -16 |
| .cfi_offset b13, -24 |
| .cfi_offset b12, -32 |
| .cfi_offset b11, -40 |
| .cfi_offset b10, -48 |
| .cfi_offset b9, -56 |
| .cfi_offset b8, -64 |
| |
| adrp x11, Lchacha20_consts |
| add x11, x11, :lo12:Lchacha20_consts |
| |
| ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values |
| ld1 {v28.16b - v30.16b}, [x5] |
| |
| mov x15, #1 // Prepare the Poly1305 state |
| mov x8, #0 |
| mov x9, #0 |
| mov x10, #0 |
| |
| ldr x12, [x5, #56] // The total cipher text length includes extra_in_len |
| add x12, x12, x2 |
| mov v31.d[0], x4 // Store the input and aad lengths |
| mov v31.d[1], x12 |
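// v31 is hashed as the final Poly1305 block (aad length || total ciphertext length)
// in Lseal_finalize, as the AEAD construction requires.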
| |
| cmp x2, #128 |
| b.le Lseal_128 // Optimization for smaller buffers |
| |
| // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, |
| // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, |
| // the fifth block (A4-D4) horizontally. |
| ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] |
| mov v4.16b, v24.16b |
| |
| ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 |
| mov v9.16b, v28.16b |
| |
| ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 |
| mov v14.16b, v29.16b |
| |
| ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] |
| add v15.4s, v15.4s, v25.4s |
| mov v19.16b, v30.16b |
| |
| sub x5, x5, #32 |
| |
| mov x6, #10 |
| |
| .align 5 |
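// Each iteration below runs one ChaCha20 double round (a column round and a diagonal
// round) over all five blocks in parallel. The 16-bit rotation is done with rev32,
// the 8-bit rotation with tbl against Lrol8, and the 12/7-bit rotations with
// ushr+sli; the ext instructions re-diagonalize the state between half rounds.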
| Lseal_init_rounds: |
| add v0.4s, v0.4s, v5.4s |
| add v1.4s, v1.4s, v6.4s |
| add v2.4s, v2.4s, v7.4s |
| add v3.4s, v3.4s, v8.4s |
| add v4.4s, v4.4s, v9.4s |
| |
| eor v15.16b, v15.16b, v0.16b |
| eor v16.16b, v16.16b, v1.16b |
| eor v17.16b, v17.16b, v2.16b |
| eor v18.16b, v18.16b, v3.16b |
| eor v19.16b, v19.16b, v4.16b |
| |
| rev32 v15.8h, v15.8h |
| rev32 v16.8h, v16.8h |
| rev32 v17.8h, v17.8h |
| rev32 v18.8h, v18.8h |
| rev32 v19.8h, v19.8h |
| |
| add v10.4s, v10.4s, v15.4s |
| add v11.4s, v11.4s, v16.4s |
| add v12.4s, v12.4s, v17.4s |
| add v13.4s, v13.4s, v18.4s |
| add v14.4s, v14.4s, v19.4s |
| |
| eor v5.16b, v5.16b, v10.16b |
| eor v6.16b, v6.16b, v11.16b |
| eor v7.16b, v7.16b, v12.16b |
| eor v8.16b, v8.16b, v13.16b |
| eor v9.16b, v9.16b, v14.16b |
| |
| ushr v20.4s, v5.4s, #20 |
| sli v20.4s, v5.4s, #12 |
| ushr v5.4s, v6.4s, #20 |
| sli v5.4s, v6.4s, #12 |
| ushr v6.4s, v7.4s, #20 |
| sli v6.4s, v7.4s, #12 |
| ushr v7.4s, v8.4s, #20 |
| sli v7.4s, v8.4s, #12 |
| ushr v8.4s, v9.4s, #20 |
| sli v8.4s, v9.4s, #12 |
| |
| add v0.4s, v0.4s, v20.4s |
| add v1.4s, v1.4s, v5.4s |
| add v2.4s, v2.4s, v6.4s |
| add v3.4s, v3.4s, v7.4s |
| add v4.4s, v4.4s, v8.4s |
| |
| eor v15.16b, v15.16b, v0.16b |
| eor v16.16b, v16.16b, v1.16b |
| eor v17.16b, v17.16b, v2.16b |
| eor v18.16b, v18.16b, v3.16b |
| eor v19.16b, v19.16b, v4.16b |
| |
| tbl v15.16b, {v15.16b}, v26.16b |
| tbl v16.16b, {v16.16b}, v26.16b |
| tbl v17.16b, {v17.16b}, v26.16b |
| tbl v18.16b, {v18.16b}, v26.16b |
| tbl v19.16b, {v19.16b}, v26.16b |
| |
| add v10.4s, v10.4s, v15.4s |
| add v11.4s, v11.4s, v16.4s |
| add v12.4s, v12.4s, v17.4s |
| add v13.4s, v13.4s, v18.4s |
| add v14.4s, v14.4s, v19.4s |
| |
| eor v20.16b, v20.16b, v10.16b |
| eor v5.16b, v5.16b, v11.16b |
| eor v6.16b, v6.16b, v12.16b |
| eor v7.16b, v7.16b, v13.16b |
| eor v8.16b, v8.16b, v14.16b |
| |
| ushr v9.4s, v8.4s, #25 |
| sli v9.4s, v8.4s, #7 |
| ushr v8.4s, v7.4s, #25 |
| sli v8.4s, v7.4s, #7 |
| ushr v7.4s, v6.4s, #25 |
| sli v7.4s, v6.4s, #7 |
| ushr v6.4s, v5.4s, #25 |
| sli v6.4s, v5.4s, #7 |
| ushr v5.4s, v20.4s, #25 |
| sli v5.4s, v20.4s, #7 |
| |
| ext v9.16b, v9.16b, v9.16b, #4 |
| ext v14.16b, v14.16b, v14.16b, #8 |
| ext v19.16b, v19.16b, v19.16b, #12 |
| add v0.4s, v0.4s, v6.4s |
| add v1.4s, v1.4s, v7.4s |
| add v2.4s, v2.4s, v8.4s |
| add v3.4s, v3.4s, v5.4s |
| add v4.4s, v4.4s, v9.4s |
| |
| eor v18.16b, v18.16b, v0.16b |
| eor v15.16b, v15.16b, v1.16b |
| eor v16.16b, v16.16b, v2.16b |
| eor v17.16b, v17.16b, v3.16b |
| eor v19.16b, v19.16b, v4.16b |
| |
| rev32 v18.8h, v18.8h |
| rev32 v15.8h, v15.8h |
| rev32 v16.8h, v16.8h |
| rev32 v17.8h, v17.8h |
| rev32 v19.8h, v19.8h |
| |
| add v12.4s, v12.4s, v18.4s |
| add v13.4s, v13.4s, v15.4s |
| add v10.4s, v10.4s, v16.4s |
| add v11.4s, v11.4s, v17.4s |
| add v14.4s, v14.4s, v19.4s |
| |
| eor v6.16b, v6.16b, v12.16b |
| eor v7.16b, v7.16b, v13.16b |
| eor v8.16b, v8.16b, v10.16b |
| eor v5.16b, v5.16b, v11.16b |
| eor v9.16b, v9.16b, v14.16b |
| |
| ushr v20.4s, v6.4s, #20 |
| sli v20.4s, v6.4s, #12 |
| ushr v6.4s, v7.4s, #20 |
| sli v6.4s, v7.4s, #12 |
| ushr v7.4s, v8.4s, #20 |
| sli v7.4s, v8.4s, #12 |
| ushr v8.4s, v5.4s, #20 |
| sli v8.4s, v5.4s, #12 |
| ushr v5.4s, v9.4s, #20 |
| sli v5.4s, v9.4s, #12 |
| |
| add v0.4s, v0.4s, v20.4s |
| add v1.4s, v1.4s, v6.4s |
| add v2.4s, v2.4s, v7.4s |
| add v3.4s, v3.4s, v8.4s |
| add v4.4s, v4.4s, v5.4s |
| |
| eor v18.16b, v18.16b, v0.16b |
| eor v15.16b, v15.16b, v1.16b |
| eor v16.16b, v16.16b, v2.16b |
| eor v17.16b, v17.16b, v3.16b |
| eor v19.16b, v19.16b, v4.16b |
| |
| tbl v18.16b, {v18.16b}, v26.16b |
| tbl v15.16b, {v15.16b}, v26.16b |
| tbl v16.16b, {v16.16b}, v26.16b |
| tbl v17.16b, {v17.16b}, v26.16b |
| tbl v19.16b, {v19.16b}, v26.16b |
| |
| add v12.4s, v12.4s, v18.4s |
| add v13.4s, v13.4s, v15.4s |
| add v10.4s, v10.4s, v16.4s |
| add v11.4s, v11.4s, v17.4s |
| add v14.4s, v14.4s, v19.4s |
| |
| eor v20.16b, v20.16b, v12.16b |
| eor v6.16b, v6.16b, v13.16b |
| eor v7.16b, v7.16b, v10.16b |
| eor v8.16b, v8.16b, v11.16b |
| eor v5.16b, v5.16b, v14.16b |
| |
| ushr v9.4s, v5.4s, #25 |
| sli v9.4s, v5.4s, #7 |
| ushr v5.4s, v8.4s, #25 |
| sli v5.4s, v8.4s, #7 |
| ushr v8.4s, v7.4s, #25 |
| sli v8.4s, v7.4s, #7 |
| ushr v7.4s, v6.4s, #25 |
| sli v7.4s, v6.4s, #7 |
| ushr v6.4s, v20.4s, #25 |
| sli v6.4s, v20.4s, #7 |
| |
| ext v9.16b, v9.16b, v9.16b, #12 |
| ext v14.16b, v14.16b, v14.16b, #8 |
| ext v19.16b, v19.16b, v19.16b, #4 |
| subs x6, x6, #1 |
| b.hi Lseal_init_rounds |
| |
| add v15.4s, v15.4s, v25.4s |
| mov x11, #4 |
| dup v20.4s, w11 |
| add v25.4s, v25.4s, v20.4s |
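// The zip1/zip2 pairs below transpose the four vertically-computed blocks from
// lane-sliced order (word i of every block in one vector) back into four contiguous
// 64-byte keystream blocks.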
| |
| zip1 v20.4s, v0.4s, v1.4s |
| zip2 v21.4s, v0.4s, v1.4s |
| zip1 v22.4s, v2.4s, v3.4s |
| zip2 v23.4s, v2.4s, v3.4s |
| |
| zip1 v0.2d, v20.2d, v22.2d |
| zip2 v1.2d, v20.2d, v22.2d |
| zip1 v2.2d, v21.2d, v23.2d |
| zip2 v3.2d, v21.2d, v23.2d |
| |
| zip1 v20.4s, v5.4s, v6.4s |
| zip2 v21.4s, v5.4s, v6.4s |
| zip1 v22.4s, v7.4s, v8.4s |
| zip2 v23.4s, v7.4s, v8.4s |
| |
| zip1 v5.2d, v20.2d, v22.2d |
| zip2 v6.2d, v20.2d, v22.2d |
| zip1 v7.2d, v21.2d, v23.2d |
| zip2 v8.2d, v21.2d, v23.2d |
| |
| zip1 v20.4s, v10.4s, v11.4s |
| zip2 v21.4s, v10.4s, v11.4s |
| zip1 v22.4s, v12.4s, v13.4s |
| zip2 v23.4s, v12.4s, v13.4s |
| |
| zip1 v10.2d, v20.2d, v22.2d |
| zip2 v11.2d, v20.2d, v22.2d |
| zip1 v12.2d, v21.2d, v23.2d |
| zip2 v13.2d, v21.2d, v23.2d |
| |
| zip1 v20.4s, v15.4s, v16.4s |
| zip2 v21.4s, v15.4s, v16.4s |
| zip1 v22.4s, v17.4s, v18.4s |
| zip2 v23.4s, v17.4s, v18.4s |
| |
| zip1 v15.2d, v20.2d, v22.2d |
| zip2 v16.2d, v20.2d, v22.2d |
| zip1 v17.2d, v21.2d, v23.2d |
| zip2 v18.2d, v21.2d, v23.2d |
| |
| add v4.4s, v4.4s, v24.4s |
| add v9.4s, v9.4s, v28.4s |
| and v4.16b, v4.16b, v27.16b |
| |
| add v0.4s, v0.4s, v24.4s |
| add v5.4s, v5.4s, v28.4s |
| add v10.4s, v10.4s, v29.4s |
| add v15.4s, v15.4s, v30.4s |
| |
| add v1.4s, v1.4s, v24.4s |
| add v6.4s, v6.4s, v28.4s |
| add v11.4s, v11.4s, v29.4s |
| add v16.4s, v16.4s, v30.4s |
| |
| add v2.4s, v2.4s, v24.4s |
| add v7.4s, v7.4s, v28.4s |
| add v12.4s, v12.4s, v29.4s |
| add v17.4s, v17.4s, v30.4s |
| |
| add v3.4s, v3.4s, v24.4s |
| add v8.4s, v8.4s, v28.4s |
| add v13.4s, v13.4s, v29.4s |
| add v18.4s, v18.4s, v30.4s |
| |
| mov x16, v4.d[0] // Move the R key to GPRs |
| mov x17, v4.d[1] |
| mov v27.16b, v9.16b // Store the S key |
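// As the AEAD construction requires, the Poly1305 (r, s) pair is the first 32 bytes
// of the keystream block computed with the initial counter value; r was clamped with
// Lclamp above and lives in x16/x17, and s is kept in v27.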
| |
| bl Lpoly_hash_ad_internal |
| |
| mov x3, x0 |
| cmp x2, #256 |
| b.le Lseal_tail |
| |
| ld1 {v20.16b - v23.16b}, [x1], #64 |
| eor v20.16b, v20.16b, v0.16b |
| eor v21.16b, v21.16b, v5.16b |
| eor v22.16b, v22.16b, v10.16b |
| eor v23.16b, v23.16b, v15.16b |
| st1 {v20.16b - v23.16b}, [x0], #64 |
| |
| ld1 {v20.16b - v23.16b}, [x1], #64 |
| eor v20.16b, v20.16b, v1.16b |
| eor v21.16b, v21.16b, v6.16b |
| eor v22.16b, v22.16b, v11.16b |
| eor v23.16b, v23.16b, v16.16b |
| st1 {v20.16b - v23.16b}, [x0], #64 |
| |
| ld1 {v20.16b - v23.16b}, [x1], #64 |
| eor v20.16b, v20.16b, v2.16b |
| eor v21.16b, v21.16b, v7.16b |
| eor v22.16b, v22.16b, v12.16b |
| eor v23.16b, v23.16b, v17.16b |
| st1 {v20.16b - v23.16b}, [x0], #64 |
| |
| ld1 {v20.16b - v23.16b}, [x1], #64 |
| eor v20.16b, v20.16b, v3.16b |
| eor v21.16b, v21.16b, v8.16b |
| eor v22.16b, v22.16b, v13.16b |
| eor v23.16b, v23.16b, v18.16b |
| st1 {v20.16b - v23.16b}, [x0], #64 |
| |
| sub x2, x2, #256 |
| |
| mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds |
| mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 |
| |
| Lseal_main_loop: |
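// Each main-loop iteration interleaves two jobs: it generates the next five ChaCha20
// blocks (320 bytes of keystream) while hashing the ciphertext produced by the
// previous iteration with Poly1305 (x6/x7 schedule how many 16-byte blocks are
// hashed per double round, as described above).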
| adrp x11, Lchacha20_consts |
| add x11, x11, :lo12:Lchacha20_consts |
| |
| ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] |
| mov v4.16b, v24.16b |
| |
| ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 |
| mov v9.16b, v28.16b |
| |
| ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 |
| mov v14.16b, v29.16b |
| |
| ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] |
| add v15.4s, v15.4s, v25.4s |
| mov v19.16b, v30.16b |
| |
| eor v20.16b, v20.16b, v20.16b //zero |
| not v21.16b, v20.16b // -1 |
| sub v21.4s, v25.4s, v21.4s // Add +1 |
| ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) |
| add v19.4s, v19.4s, v20.4s |
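// Lane 0 of v19 (the fifth block's counter) is advanced by v25.s[3] + 1, so the
// horizontally-computed block takes the counter value immediately after the four
// vertical blocks.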
| |
| sub x5, x5, #32 |
| .align 5 |
| Lseal_main_loop_rounds: |
| add v0.4s, v0.4s, v5.4s |
| add v1.4s, v1.4s, v6.4s |
| add v2.4s, v2.4s, v7.4s |
| add v3.4s, v3.4s, v8.4s |
| add v4.4s, v4.4s, v9.4s |
| |
| eor v15.16b, v15.16b, v0.16b |
| eor v16.16b, v16.16b, v1.16b |
| eor v17.16b, v17.16b, v2.16b |
| eor v18.16b, v18.16b, v3.16b |
| eor v19.16b, v19.16b, v4.16b |
| |
| rev32 v15.8h, v15.8h |
| rev32 v16.8h, v16.8h |
| rev32 v17.8h, v17.8h |
| rev32 v18.8h, v18.8h |
| rev32 v19.8h, v19.8h |
| |
| add v10.4s, v10.4s, v15.4s |
| add v11.4s, v11.4s, v16.4s |
| add v12.4s, v12.4s, v17.4s |
| add v13.4s, v13.4s, v18.4s |
| add v14.4s, v14.4s, v19.4s |
| |
| eor v5.16b, v5.16b, v10.16b |
| eor v6.16b, v6.16b, v11.16b |
| eor v7.16b, v7.16b, v12.16b |
| eor v8.16b, v8.16b, v13.16b |
| eor v9.16b, v9.16b, v14.16b |
| |
| ushr v20.4s, v5.4s, #20 |
| sli v20.4s, v5.4s, #12 |
| ushr v5.4s, v6.4s, #20 |
| sli v5.4s, v6.4s, #12 |
| ushr v6.4s, v7.4s, #20 |
| sli v6.4s, v7.4s, #12 |
| ushr v7.4s, v8.4s, #20 |
| sli v7.4s, v8.4s, #12 |
| ushr v8.4s, v9.4s, #20 |
| sli v8.4s, v9.4s, #12 |
| |
| add v0.4s, v0.4s, v20.4s |
| add v1.4s, v1.4s, v5.4s |
| add v2.4s, v2.4s, v6.4s |
| add v3.4s, v3.4s, v7.4s |
| add v4.4s, v4.4s, v8.4s |
| |
| eor v15.16b, v15.16b, v0.16b |
| eor v16.16b, v16.16b, v1.16b |
| eor v17.16b, v17.16b, v2.16b |
| eor v18.16b, v18.16b, v3.16b |
| eor v19.16b, v19.16b, v4.16b |
| |
| tbl v15.16b, {v15.16b}, v26.16b |
| tbl v16.16b, {v16.16b}, v26.16b |
| tbl v17.16b, {v17.16b}, v26.16b |
| tbl v18.16b, {v18.16b}, v26.16b |
| tbl v19.16b, {v19.16b}, v26.16b |
| |
| add v10.4s, v10.4s, v15.4s |
| add v11.4s, v11.4s, v16.4s |
| add v12.4s, v12.4s, v17.4s |
| add v13.4s, v13.4s, v18.4s |
| add v14.4s, v14.4s, v19.4s |
| |
| eor v20.16b, v20.16b, v10.16b |
| eor v5.16b, v5.16b, v11.16b |
| eor v6.16b, v6.16b, v12.16b |
| eor v7.16b, v7.16b, v13.16b |
| eor v8.16b, v8.16b, v14.16b |
| |
| ushr v9.4s, v8.4s, #25 |
| sli v9.4s, v8.4s, #7 |
| ushr v8.4s, v7.4s, #25 |
| sli v8.4s, v7.4s, #7 |
| ushr v7.4s, v6.4s, #25 |
| sli v7.4s, v6.4s, #7 |
| ushr v6.4s, v5.4s, #25 |
| sli v6.4s, v5.4s, #7 |
| ushr v5.4s, v20.4s, #25 |
| sli v5.4s, v20.4s, #7 |
| |
| ext v9.16b, v9.16b, v9.16b, #4 |
| ext v14.16b, v14.16b, v14.16b, #8 |
| ext v19.16b, v19.16b, v19.16b, #12 |
| ldp x11, x12, [x3], 16 |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| add v0.4s, v0.4s, v6.4s |
| add v1.4s, v1.4s, v7.4s |
| add v2.4s, v2.4s, v8.4s |
| add v3.4s, v3.4s, v5.4s |
| add v4.4s, v4.4s, v9.4s |
| |
| eor v18.16b, v18.16b, v0.16b |
| eor v15.16b, v15.16b, v1.16b |
| eor v16.16b, v16.16b, v2.16b |
| eor v17.16b, v17.16b, v3.16b |
| eor v19.16b, v19.16b, v4.16b |
| |
| rev32 v18.8h, v18.8h |
| rev32 v15.8h, v15.8h |
| rev32 v16.8h, v16.8h |
| rev32 v17.8h, v17.8h |
| rev32 v19.8h, v19.8h |
| |
| add v12.4s, v12.4s, v18.4s |
| add v13.4s, v13.4s, v15.4s |
| add v10.4s, v10.4s, v16.4s |
| add v11.4s, v11.4s, v17.4s |
| add v14.4s, v14.4s, v19.4s |
| |
| eor v6.16b, v6.16b, v12.16b |
| eor v7.16b, v7.16b, v13.16b |
| eor v8.16b, v8.16b, v10.16b |
| eor v5.16b, v5.16b, v11.16b |
| eor v9.16b, v9.16b, v14.16b |
| |
| ushr v20.4s, v6.4s, #20 |
| sli v20.4s, v6.4s, #12 |
| ushr v6.4s, v7.4s, #20 |
| sli v6.4s, v7.4s, #12 |
| ushr v7.4s, v8.4s, #20 |
| sli v7.4s, v8.4s, #12 |
| ushr v8.4s, v5.4s, #20 |
| sli v8.4s, v5.4s, #12 |
| ushr v5.4s, v9.4s, #20 |
| sli v5.4s, v9.4s, #12 |
| |
| add v0.4s, v0.4s, v20.4s |
| add v1.4s, v1.4s, v6.4s |
| add v2.4s, v2.4s, v7.4s |
| add v3.4s, v3.4s, v8.4s |
| add v4.4s, v4.4s, v5.4s |
| |
| eor v18.16b, v18.16b, v0.16b |
| eor v15.16b, v15.16b, v1.16b |
| eor v16.16b, v16.16b, v2.16b |
| eor v17.16b, v17.16b, v3.16b |
| eor v19.16b, v19.16b, v4.16b |
| |
| tbl v18.16b, {v18.16b}, v26.16b |
| tbl v15.16b, {v15.16b}, v26.16b |
| tbl v16.16b, {v16.16b}, v26.16b |
| tbl v17.16b, {v17.16b}, v26.16b |
| tbl v19.16b, {v19.16b}, v26.16b |
| |
| add v12.4s, v12.4s, v18.4s |
| add v13.4s, v13.4s, v15.4s |
| add v10.4s, v10.4s, v16.4s |
| add v11.4s, v11.4s, v17.4s |
| add v14.4s, v14.4s, v19.4s |
| |
| eor v20.16b, v20.16b, v12.16b |
| eor v6.16b, v6.16b, v13.16b |
| eor v7.16b, v7.16b, v10.16b |
| eor v8.16b, v8.16b, v11.16b |
| eor v5.16b, v5.16b, v14.16b |
| |
| ushr v9.4s, v5.4s, #25 |
| sli v9.4s, v5.4s, #7 |
| ushr v5.4s, v8.4s, #25 |
| sli v5.4s, v8.4s, #7 |
| ushr v8.4s, v7.4s, #25 |
| sli v8.4s, v7.4s, #7 |
| ushr v7.4s, v6.4s, #25 |
| sli v7.4s, v6.4s, #7 |
| ushr v6.4s, v20.4s, #25 |
| sli v6.4s, v20.4s, #7 |
| |
| ext v9.16b, v9.16b, v9.16b, #12 |
| ext v14.16b, v14.16b, v14.16b, #8 |
| ext v19.16b, v19.16b, v19.16b, #4 |
| subs x6, x6, #1 |
| b.ge Lseal_main_loop_rounds |
| ldp x11, x12, [x3], 16 |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| subs x7, x7, #1 |
| b.gt Lseal_main_loop_rounds |
| |
| eor v20.16b, v20.16b, v20.16b //zero |
| not v21.16b, v20.16b // -1 |
| sub v21.4s, v25.4s, v21.4s // Add +1 |
| ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) |
| add v19.4s, v19.4s, v20.4s |
| |
| add v15.4s, v15.4s, v25.4s |
| mov x11, #5 |
| dup v20.4s, w11 |
| add v25.4s, v25.4s, v20.4s |
| |
| zip1 v20.4s, v0.4s, v1.4s |
| zip2 v21.4s, v0.4s, v1.4s |
| zip1 v22.4s, v2.4s, v3.4s |
| zip2 v23.4s, v2.4s, v3.4s |
| |
| zip1 v0.2d, v20.2d, v22.2d |
| zip2 v1.2d, v20.2d, v22.2d |
| zip1 v2.2d, v21.2d, v23.2d |
| zip2 v3.2d, v21.2d, v23.2d |
| |
| zip1 v20.4s, v5.4s, v6.4s |
| zip2 v21.4s, v5.4s, v6.4s |
| zip1 v22.4s, v7.4s, v8.4s |
| zip2 v23.4s, v7.4s, v8.4s |
| |
| zip1 v5.2d, v20.2d, v22.2d |
| zip2 v6.2d, v20.2d, v22.2d |
| zip1 v7.2d, v21.2d, v23.2d |
| zip2 v8.2d, v21.2d, v23.2d |
| |
| zip1 v20.4s, v10.4s, v11.4s |
| zip2 v21.4s, v10.4s, v11.4s |
| zip1 v22.4s, v12.4s, v13.4s |
| zip2 v23.4s, v12.4s, v13.4s |
| |
| zip1 v10.2d, v20.2d, v22.2d |
| zip2 v11.2d, v20.2d, v22.2d |
| zip1 v12.2d, v21.2d, v23.2d |
| zip2 v13.2d, v21.2d, v23.2d |
| |
| zip1 v20.4s, v15.4s, v16.4s |
| zip2 v21.4s, v15.4s, v16.4s |
| zip1 v22.4s, v17.4s, v18.4s |
| zip2 v23.4s, v17.4s, v18.4s |
| |
| zip1 v15.2d, v20.2d, v22.2d |
| zip2 v16.2d, v20.2d, v22.2d |
| zip1 v17.2d, v21.2d, v23.2d |
| zip2 v18.2d, v21.2d, v23.2d |
| |
| add v0.4s, v0.4s, v24.4s |
| add v5.4s, v5.4s, v28.4s |
| add v10.4s, v10.4s, v29.4s |
| add v15.4s, v15.4s, v30.4s |
| |
| add v1.4s, v1.4s, v24.4s |
| add v6.4s, v6.4s, v28.4s |
| add v11.4s, v11.4s, v29.4s |
| add v16.4s, v16.4s, v30.4s |
| |
| add v2.4s, v2.4s, v24.4s |
| add v7.4s, v7.4s, v28.4s |
| add v12.4s, v12.4s, v29.4s |
| add v17.4s, v17.4s, v30.4s |
| |
| add v3.4s, v3.4s, v24.4s |
| add v8.4s, v8.4s, v28.4s |
| add v13.4s, v13.4s, v29.4s |
| add v18.4s, v18.4s, v30.4s |
| |
| add v4.4s, v4.4s, v24.4s |
| add v9.4s, v9.4s, v28.4s |
| add v14.4s, v14.4s, v29.4s |
| add v19.4s, v19.4s, v30.4s |
| |
| cmp x2, #320 |
| b.le Lseal_tail |
| |
| ld1 {v20.16b - v23.16b}, [x1], #64 |
| eor v20.16b, v20.16b, v0.16b |
| eor v21.16b, v21.16b, v5.16b |
| eor v22.16b, v22.16b, v10.16b |
| eor v23.16b, v23.16b, v15.16b |
| st1 {v20.16b - v23.16b}, [x0], #64 |
| |
| ld1 {v20.16b - v23.16b}, [x1], #64 |
| eor v20.16b, v20.16b, v1.16b |
| eor v21.16b, v21.16b, v6.16b |
| eor v22.16b, v22.16b, v11.16b |
| eor v23.16b, v23.16b, v16.16b |
| st1 {v20.16b - v23.16b}, [x0], #64 |
| |
| ld1 {v20.16b - v23.16b}, [x1], #64 |
| eor v20.16b, v20.16b, v2.16b |
| eor v21.16b, v21.16b, v7.16b |
| eor v22.16b, v22.16b, v12.16b |
| eor v23.16b, v23.16b, v17.16b |
| st1 {v20.16b - v23.16b}, [x0], #64 |
| |
| ld1 {v20.16b - v23.16b}, [x1], #64 |
| eor v20.16b, v20.16b, v3.16b |
| eor v21.16b, v21.16b, v8.16b |
| eor v22.16b, v22.16b, v13.16b |
| eor v23.16b, v23.16b, v18.16b |
| st1 {v20.16b - v23.16b}, [x0], #64 |
| |
| ld1 {v20.16b - v23.16b}, [x1], #64 |
| eor v20.16b, v20.16b, v4.16b |
| eor v21.16b, v21.16b, v9.16b |
| eor v22.16b, v22.16b, v14.16b |
| eor v23.16b, v23.16b, v19.16b |
| st1 {v20.16b - v23.16b}, [x0], #64 |
| |
| sub x2, x2, #320 |
| |
| mov x6, #0 |
| mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration |
| |
| b Lseal_main_loop |
| |
| Lseal_tail: |
| // This part of the function handles the storage and authentication of the last [0,320) bytes |
| // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. |
| cmp x2, #64 |
| b.lt Lseal_tail_64 |
| |
| // Store and authenticate 64B blocks per iteration |
| ld1 {v20.16b - v23.16b}, [x1], #64 |
| |
| eor v20.16b, v20.16b, v0.16b |
| eor v21.16b, v21.16b, v5.16b |
| eor v22.16b, v22.16b, v10.16b |
| eor v23.16b, v23.16b, v15.16b |
| mov x11, v20.d[0] |
| mov x12, v20.d[1] |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| mov x11, v21.d[0] |
| mov x12, v21.d[1] |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| mov x11, v22.d[0] |
| mov x12, v22.d[1] |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| mov x11, v23.d[0] |
| mov x12, v23.d[1] |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| st1 {v20.16b - v23.16b}, [x0], #64 |
| sub x2, x2, #64 |
| |
| // Shift the state left by 64 bytes for the next iteration of the loop |
| mov v0.16b, v1.16b |
| mov v5.16b, v6.16b |
| mov v10.16b, v11.16b |
| mov v15.16b, v16.16b |
| |
| mov v1.16b, v2.16b |
| mov v6.16b, v7.16b |
| mov v11.16b, v12.16b |
| mov v16.16b, v17.16b |
| |
| mov v2.16b, v3.16b |
| mov v7.16b, v8.16b |
| mov v12.16b, v13.16b |
| mov v17.16b, v18.16b |
| |
| mov v3.16b, v4.16b |
| mov v8.16b, v9.16b |
| mov v13.16b, v14.16b |
| mov v18.16b, v19.16b |
| |
| b Lseal_tail |
| |
| Lseal_tail_64: |
ldp x3, x4, [x5, #48] // x3 = extra_in pointer, x4 = extra_in_len
| |
| // Here we handle the last [0,64) bytes of plaintext |
| cmp x2, #16 |
| b.lt Lseal_tail_16 |
// Each iteration encrypts and authenticates a 16B block
| ld1 {v20.16b}, [x1], #16 |
| eor v20.16b, v20.16b, v0.16b |
| mov x11, v20.d[0] |
| mov x12, v20.d[1] |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| st1 {v20.16b}, [x0], #16 |
| |
| sub x2, x2, #16 |
| |
| // Shift the state left by 16 bytes for the next iteration of the loop |
| mov v0.16b, v5.16b |
| mov v5.16b, v10.16b |
| mov v10.16b, v15.16b |
| |
| b Lseal_tail_64 |
| |
| Lseal_tail_16: |
| // Here we handle the last [0,16) bytes of ciphertext that require a padded block |
| cbz x2, Lseal_hash_extra |
| |
| eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in |
eor v21.16b, v21.16b, v21.16b // Use T1 to build an AND mask covering only the bytes that become ciphertext
| not v22.16b, v20.16b |
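// The loop below shifts the remaining plaintext bytes into v20 from the end while
// shifting 0xff bytes from v22 into v21, so v21 ends up covering exactly the bytes
// that hold real plaintext; the keystream is ANDed with that mask so any extra_in
// padding bytes pass through unencrypted into the block that gets hashed.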
| |
| mov x6, x2 |
| add x1, x1, x2 |
| |
| cbz x4, Lseal_tail_16_compose // No extra data to pad with, zero padding |
| |
| mov x7, #16 // We need to load some extra_in first for padding |
| sub x7, x7, x2 |
| cmp x4, x7 |
| csel x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register |
| mov x12, x7 |
| add x3, x3, x7 |
| sub x4, x4, x7 |
| |
| Lseal_tail16_compose_extra_in: |
| ext v20.16b, v20.16b, v20.16b, #15 |
| ldrb w11, [x3, #-1]! |
| mov v20.b[0], w11 |
| subs x7, x7, #1 |
| b.gt Lseal_tail16_compose_extra_in |
| |
| add x3, x3, x12 |
| |
| Lseal_tail_16_compose: |
| ext v20.16b, v20.16b, v20.16b, #15 |
| ldrb w11, [x1, #-1]! |
| mov v20.b[0], w11 |
| ext v21.16b, v22.16b, v21.16b, #15 |
| subs x2, x2, #1 |
| b.gt Lseal_tail_16_compose |
| |
| and v0.16b, v0.16b, v21.16b |
| eor v20.16b, v20.16b, v0.16b |
| mov v21.16b, v20.16b |
| |
| Lseal_tail_16_store: |
| umov w11, v20.b[0] |
| strb w11, [x0], #1 |
| ext v20.16b, v20.16b, v20.16b, #1 |
| subs x6, x6, #1 |
| b.gt Lseal_tail_16_store |
| |
| // Hash in the final ct block concatenated with extra_in |
| mov x11, v21.d[0] |
| mov x12, v21.d[1] |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| |
| Lseal_hash_extra: |
| cbz x4, Lseal_finalize |
| |
| Lseal_hash_extra_loop: |
| cmp x4, #16 |
| b.lt Lseal_hash_extra_tail |
| ld1 {v20.16b}, [x3], #16 |
| mov x11, v20.d[0] |
| mov x12, v20.d[1] |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| sub x4, x4, #16 |
| b Lseal_hash_extra_loop |
| |
| Lseal_hash_extra_tail: |
| cbz x4, Lseal_finalize |
| eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext |
| add x3, x3, x4 |
| |
| Lseal_hash_extra_load: |
| ext v20.16b, v20.16b, v20.16b, #15 |
| ldrb w11, [x3, #-1]! |
| mov v20.b[0], w11 |
| subs x4, x4, #1 |
| b.gt Lseal_hash_extra_load |
| |
// Hash in the final padded extra_in block
| mov x11, v20.d[0] |
| mov x12, v20.d[1] |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| |
| Lseal_finalize: |
| mov x11, v31.d[0] |
| mov x12, v31.d[1] |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| // Final reduction step |
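// Conditionally subtract p = 2^130 - 5: the three-limb subtraction borrows only if
// acc < p, so csel keeps the reduced value when the carry (no-borrow) flag is set.
// The s key (saved in v27) is then added and the low 128 bits are stored as the tag.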
| sub x12, xzr, x15 |
| orr x13, xzr, #3 |
| subs x11, x8, #-5 |
| sbcs x12, x9, x12 |
| sbcs x13, x10, x13 |
| csel x8, x11, x8, cs |
| csel x9, x12, x9, cs |
| csel x10, x13, x10, cs |
| mov x11, v27.d[0] |
| mov x12, v27.d[1] |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| |
| stp x8, x9, [x5] |
| |
| ldp d8, d9, [sp, #16] |
| ldp d10, d11, [sp, #32] |
| ldp d12, d13, [sp, #48] |
| ldp d14, d15, [sp, #64] |
| .cfi_restore b15 |
| .cfi_restore b14 |
| .cfi_restore b13 |
| .cfi_restore b12 |
| .cfi_restore b11 |
| .cfi_restore b10 |
| .cfi_restore b9 |
| .cfi_restore b8 |
| ldp x29, x30, [sp], 80 |
| .cfi_restore w29 |
| .cfi_restore w30 |
| .cfi_def_cfa_offset 0 |
| AARCH64_VALIDATE_LINK_REGISTER |
| ret |
| |
| Lseal_128: |
| // On some architectures preparing 5 blocks for small buffers is wasteful |
| eor v25.16b, v25.16b, v25.16b |
| mov x11, #1 |
| mov v25.s[0], w11 |
| mov v0.16b, v24.16b |
| mov v1.16b, v24.16b |
| mov v2.16b, v24.16b |
| mov v5.16b, v28.16b |
| mov v6.16b, v28.16b |
| mov v7.16b, v28.16b |
| mov v10.16b, v29.16b |
| mov v11.16b, v29.16b |
| mov v12.16b, v29.16b |
| mov v17.16b, v30.16b |
| add v15.4s, v17.4s, v25.4s |
| add v16.4s, v15.4s, v25.4s |
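// Three blocks are enough here: v17 keeps the initial counter and only feeds the
// Poly1305 key, while v15/v16 (counter +1 and +2) provide up to 128 bytes of
// keystream.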
| |
| mov x6, #10 |
| |
| Lseal_128_rounds: |
| add v0.4s, v0.4s, v5.4s |
| add v1.4s, v1.4s, v6.4s |
| add v2.4s, v2.4s, v7.4s |
| eor v15.16b, v15.16b, v0.16b |
| eor v16.16b, v16.16b, v1.16b |
| eor v17.16b, v17.16b, v2.16b |
| rev32 v15.8h, v15.8h |
| rev32 v16.8h, v16.8h |
| rev32 v17.8h, v17.8h |
| |
| add v10.4s, v10.4s, v15.4s |
| add v11.4s, v11.4s, v16.4s |
| add v12.4s, v12.4s, v17.4s |
| eor v5.16b, v5.16b, v10.16b |
| eor v6.16b, v6.16b, v11.16b |
| eor v7.16b, v7.16b, v12.16b |
| ushr v20.4s, v5.4s, #20 |
| sli v20.4s, v5.4s, #12 |
| ushr v5.4s, v6.4s, #20 |
| sli v5.4s, v6.4s, #12 |
| ushr v6.4s, v7.4s, #20 |
| sli v6.4s, v7.4s, #12 |
| |
| add v0.4s, v0.4s, v20.4s |
| add v1.4s, v1.4s, v5.4s |
| add v2.4s, v2.4s, v6.4s |
| eor v15.16b, v15.16b, v0.16b |
| eor v16.16b, v16.16b, v1.16b |
| eor v17.16b, v17.16b, v2.16b |
| tbl v15.16b, {v15.16b}, v26.16b |
| tbl v16.16b, {v16.16b}, v26.16b |
| tbl v17.16b, {v17.16b}, v26.16b |
| |
| add v10.4s, v10.4s, v15.4s |
| add v11.4s, v11.4s, v16.4s |
| add v12.4s, v12.4s, v17.4s |
| eor v20.16b, v20.16b, v10.16b |
| eor v5.16b, v5.16b, v11.16b |
| eor v6.16b, v6.16b, v12.16b |
| ushr v7.4s, v6.4s, #25 |
| sli v7.4s, v6.4s, #7 |
| ushr v6.4s, v5.4s, #25 |
| sli v6.4s, v5.4s, #7 |
| ushr v5.4s, v20.4s, #25 |
| sli v5.4s, v20.4s, #7 |
| |
| ext v5.16b, v5.16b, v5.16b, #4 |
| ext v6.16b, v6.16b, v6.16b, #4 |
| ext v7.16b, v7.16b, v7.16b, #4 |
| |
| ext v10.16b, v10.16b, v10.16b, #8 |
| ext v11.16b, v11.16b, v11.16b, #8 |
| ext v12.16b, v12.16b, v12.16b, #8 |
| |
| ext v15.16b, v15.16b, v15.16b, #12 |
| ext v16.16b, v16.16b, v16.16b, #12 |
| ext v17.16b, v17.16b, v17.16b, #12 |
| add v0.4s, v0.4s, v5.4s |
| add v1.4s, v1.4s, v6.4s |
| add v2.4s, v2.4s, v7.4s |
| eor v15.16b, v15.16b, v0.16b |
| eor v16.16b, v16.16b, v1.16b |
| eor v17.16b, v17.16b, v2.16b |
| rev32 v15.8h, v15.8h |
| rev32 v16.8h, v16.8h |
| rev32 v17.8h, v17.8h |
| |
| add v10.4s, v10.4s, v15.4s |
| add v11.4s, v11.4s, v16.4s |
| add v12.4s, v12.4s, v17.4s |
| eor v5.16b, v5.16b, v10.16b |
| eor v6.16b, v6.16b, v11.16b |
| eor v7.16b, v7.16b, v12.16b |
| ushr v20.4s, v5.4s, #20 |
| sli v20.4s, v5.4s, #12 |
| ushr v5.4s, v6.4s, #20 |
| sli v5.4s, v6.4s, #12 |
| ushr v6.4s, v7.4s, #20 |
| sli v6.4s, v7.4s, #12 |
| |
| add v0.4s, v0.4s, v20.4s |
| add v1.4s, v1.4s, v5.4s |
| add v2.4s, v2.4s, v6.4s |
| eor v15.16b, v15.16b, v0.16b |
| eor v16.16b, v16.16b, v1.16b |
| eor v17.16b, v17.16b, v2.16b |
| tbl v15.16b, {v15.16b}, v26.16b |
| tbl v16.16b, {v16.16b}, v26.16b |
| tbl v17.16b, {v17.16b}, v26.16b |
| |
| add v10.4s, v10.4s, v15.4s |
| add v11.4s, v11.4s, v16.4s |
| add v12.4s, v12.4s, v17.4s |
| eor v20.16b, v20.16b, v10.16b |
| eor v5.16b, v5.16b, v11.16b |
| eor v6.16b, v6.16b, v12.16b |
| ushr v7.4s, v6.4s, #25 |
| sli v7.4s, v6.4s, #7 |
| ushr v6.4s, v5.4s, #25 |
| sli v6.4s, v5.4s, #7 |
| ushr v5.4s, v20.4s, #25 |
| sli v5.4s, v20.4s, #7 |
| |
| ext v5.16b, v5.16b, v5.16b, #12 |
| ext v6.16b, v6.16b, v6.16b, #12 |
| ext v7.16b, v7.16b, v7.16b, #12 |
| |
| ext v10.16b, v10.16b, v10.16b, #8 |
| ext v11.16b, v11.16b, v11.16b, #8 |
| ext v12.16b, v12.16b, v12.16b, #8 |
| |
| ext v15.16b, v15.16b, v15.16b, #4 |
| ext v16.16b, v16.16b, v16.16b, #4 |
| ext v17.16b, v17.16b, v17.16b, #4 |
| subs x6, x6, #1 |
| b.hi Lseal_128_rounds |
| |
| add v0.4s, v0.4s, v24.4s |
| add v1.4s, v1.4s, v24.4s |
| add v2.4s, v2.4s, v24.4s |
| |
| add v5.4s, v5.4s, v28.4s |
| add v6.4s, v6.4s, v28.4s |
| add v7.4s, v7.4s, v28.4s |
| |
| // Only the first 32 bytes of the third block (counter = 0) are needed, |
| // so skip updating v12 and v17. |
| add v10.4s, v10.4s, v29.4s |
| add v11.4s, v11.4s, v29.4s |
| |
| add v30.4s, v30.4s, v25.4s |
| add v15.4s, v15.4s, v30.4s |
| add v30.4s, v30.4s, v25.4s |
| add v16.4s, v16.4s, v30.4s |
| |
| and v2.16b, v2.16b, v27.16b |
| mov x16, v2.d[0] // Move the R key to GPRs |
| mov x17, v2.d[1] |
| mov v27.16b, v7.16b // Store the S key |
| |
| bl Lpoly_hash_ad_internal |
| b Lseal_tail |
| .cfi_endproc |
| |
| |
| ///////////////////////////////// |
| // |
| // void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); |
| // |
| .globl chacha20_poly1305_open |
| |
| .def chacha20_poly1305_open |
| .type 32 |
| .endef |
| .align 6 |
| chacha20_poly1305_open: |
| AARCH64_SIGN_LINK_REGISTER |
| .cfi_startproc |
| stp x29, x30, [sp, #-80]! |
| .cfi_def_cfa_offset 80 |
| .cfi_offset w30, -72 |
| .cfi_offset w29, -80 |
| mov x29, sp |
| // We probably could do .cfi_def_cfa w29, 80 at this point, but since |
| // we don't actually use the frame pointer like that, it's probably not |
| // worth bothering. |
| stp d8, d9, [sp, #16] |
| stp d10, d11, [sp, #32] |
| stp d12, d13, [sp, #48] |
| stp d14, d15, [sp, #64] |
| .cfi_offset b15, -8 |
| .cfi_offset b14, -16 |
| .cfi_offset b13, -24 |
| .cfi_offset b12, -32 |
| .cfi_offset b11, -40 |
| .cfi_offset b10, -48 |
| .cfi_offset b9, -56 |
| .cfi_offset b8, -64 |
| |
| adrp x11, Lchacha20_consts |
| add x11, x11, :lo12:Lchacha20_consts |
| |
| ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values |
| ld1 {v28.16b - v30.16b}, [x5] |
| |
| mov x15, #1 // Prepare the Poly1305 state |
| mov x8, #0 |
| mov x9, #0 |
| mov x10, #0 |
| |
| mov v31.d[0], x4 // Store the input and aad lengths |
| mov v31.d[1], x2 |
| |
| cmp x2, #128 |
| b.le Lopen_128 // Optimization for smaller buffers |
| |
| // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys |
| mov v0.16b, v24.16b |
| mov v5.16b, v28.16b |
| mov v10.16b, v29.16b |
| mov v15.16b, v30.16b |
| |
| mov x6, #10 |
| |
| .align 5 |
| Lopen_init_rounds: |
| add v0.4s, v0.4s, v5.4s |
| eor v15.16b, v15.16b, v0.16b |
| rev32 v15.8h, v15.8h |
| |
| add v10.4s, v10.4s, v15.4s |
| eor v5.16b, v5.16b, v10.16b |
| ushr v20.4s, v5.4s, #20 |
| sli v20.4s, v5.4s, #12 |
| add v0.4s, v0.4s, v20.4s |
| eor v15.16b, v15.16b, v0.16b |
| tbl v15.16b, {v15.16b}, v26.16b |
| |
| add v10.4s, v10.4s, v15.4s |
| eor v20.16b, v20.16b, v10.16b |
| ushr v5.4s, v20.4s, #25 |
| sli v5.4s, v20.4s, #7 |
| ext v5.16b, v5.16b, v5.16b, #4 |
| ext v10.16b, v10.16b, v10.16b, #8 |
| ext v15.16b, v15.16b, v15.16b, #12 |
| add v0.4s, v0.4s, v5.4s |
| eor v15.16b, v15.16b, v0.16b |
| rev32 v15.8h, v15.8h |
| |
| add v10.4s, v10.4s, v15.4s |
| eor v5.16b, v5.16b, v10.16b |
| ushr v20.4s, v5.4s, #20 |
| sli v20.4s, v5.4s, #12 |
| add v0.4s, v0.4s, v20.4s |
| eor v15.16b, v15.16b, v0.16b |
| tbl v15.16b, {v15.16b}, v26.16b |
| |
| add v10.4s, v10.4s, v15.4s |
| eor v20.16b, v20.16b, v10.16b |
| ushr v5.4s, v20.4s, #25 |
| sli v5.4s, v20.4s, #7 |
| ext v5.16b, v5.16b, v5.16b, #12 |
| ext v10.16b, v10.16b, v10.16b, #8 |
| ext v15.16b, v15.16b, v15.16b, #4 |
| subs x6, x6, #1 |
| b.hi Lopen_init_rounds |
| |
| add v0.4s, v0.4s, v24.4s |
| add v5.4s, v5.4s, v28.4s |
| |
| and v0.16b, v0.16b, v27.16b |
| mov x16, v0.d[0] // Move the R key to GPRs |
| mov x17, v0.d[1] |
| mov v27.16b, v5.16b // Store the S key |
| |
| bl Lpoly_hash_ad_internal |
| |
| Lopen_ad_done: |
| mov x3, x1 |
| |
// Each iteration of the loop hashes 320 bytes and prepares stream for 320 bytes
| Lopen_main_loop: |
| |
| cmp x2, #192 |
| b.lt Lopen_tail |
| |
| adrp x11, Lchacha20_consts |
| add x11, x11, :lo12:Lchacha20_consts |
| |
| ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] |
| mov v4.16b, v24.16b |
| |
| ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 |
| mov v9.16b, v28.16b |
| |
| ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 |
| mov v14.16b, v29.16b |
| |
| ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] |
| sub x5, x5, #32 |
| add v15.4s, v15.4s, v25.4s |
| mov v19.16b, v30.16b |
| |
| eor v20.16b, v20.16b, v20.16b //zero |
| not v21.16b, v20.16b // -1 |
| sub v21.4s, v25.4s, v21.4s // Add +1 |
| ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) |
| add v19.4s, v19.4s, v20.4s |
| |
| lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12 |
| sub x4, x4, #10 |
| |
| mov x7, #10 |
| subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash |
| csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full |
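// Rounds entered at Lopen_main_loop_rounds hash two 16-byte blocks (one before and
// one inside the double round), while rounds entered at Lopen_main_loop_rounds_short
// hash only the in-round block; x7 and x6 schedule how many of each run so the
// Poly1305 hashing of this iteration's input is spread across the ChaCha20 rounds.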
| |
| cbz x7, Lopen_main_loop_rounds_short |
| |
| .align 5 |
| Lopen_main_loop_rounds: |
| ldp x11, x12, [x3], 16 |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| Lopen_main_loop_rounds_short: |
| add v0.4s, v0.4s, v5.4s |
| add v1.4s, v1.4s, v6.4s |
| add v2.4s, v2.4s, v7.4s |
| add v3.4s, v3.4s, v8.4s |
| add v4.4s, v4.4s, v9.4s |
| |
| eor v15.16b, v15.16b, v0.16b |
| eor v16.16b, v16.16b, v1.16b |
| eor v17.16b, v17.16b, v2.16b |
| eor v18.16b, v18.16b, v3.16b |
| eor v19.16b, v19.16b, v4.16b |
| |
| rev32 v15.8h, v15.8h |
| rev32 v16.8h, v16.8h |
| rev32 v17.8h, v17.8h |
| rev32 v18.8h, v18.8h |
| rev32 v19.8h, v19.8h |
| |
| add v10.4s, v10.4s, v15.4s |
| add v11.4s, v11.4s, v16.4s |
| add v12.4s, v12.4s, v17.4s |
| add v13.4s, v13.4s, v18.4s |
| add v14.4s, v14.4s, v19.4s |
| |
| eor v5.16b, v5.16b, v10.16b |
| eor v6.16b, v6.16b, v11.16b |
| eor v7.16b, v7.16b, v12.16b |
| eor v8.16b, v8.16b, v13.16b |
| eor v9.16b, v9.16b, v14.16b |
| |
| ushr v20.4s, v5.4s, #20 |
| sli v20.4s, v5.4s, #12 |
| ushr v5.4s, v6.4s, #20 |
| sli v5.4s, v6.4s, #12 |
| ushr v6.4s, v7.4s, #20 |
| sli v6.4s, v7.4s, #12 |
| ushr v7.4s, v8.4s, #20 |
| sli v7.4s, v8.4s, #12 |
| ushr v8.4s, v9.4s, #20 |
| sli v8.4s, v9.4s, #12 |
| |
| add v0.4s, v0.4s, v20.4s |
| add v1.4s, v1.4s, v5.4s |
| add v2.4s, v2.4s, v6.4s |
| add v3.4s, v3.4s, v7.4s |
| add v4.4s, v4.4s, v8.4s |
| |
| eor v15.16b, v15.16b, v0.16b |
| eor v16.16b, v16.16b, v1.16b |
| eor v17.16b, v17.16b, v2.16b |
| eor v18.16b, v18.16b, v3.16b |
| eor v19.16b, v19.16b, v4.16b |
| |
| tbl v15.16b, {v15.16b}, v26.16b |
| tbl v16.16b, {v16.16b}, v26.16b |
| tbl v17.16b, {v17.16b}, v26.16b |
| tbl v18.16b, {v18.16b}, v26.16b |
| tbl v19.16b, {v19.16b}, v26.16b |
| |
| add v10.4s, v10.4s, v15.4s |
| add v11.4s, v11.4s, v16.4s |
| add v12.4s, v12.4s, v17.4s |
| add v13.4s, v13.4s, v18.4s |
| add v14.4s, v14.4s, v19.4s |
| |
| eor v20.16b, v20.16b, v10.16b |
| eor v5.16b, v5.16b, v11.16b |
| eor v6.16b, v6.16b, v12.16b |
| eor v7.16b, v7.16b, v13.16b |
| eor v8.16b, v8.16b, v14.16b |
| |
| ushr v9.4s, v8.4s, #25 |
| sli v9.4s, v8.4s, #7 |
| ushr v8.4s, v7.4s, #25 |
| sli v8.4s, v7.4s, #7 |
| ushr v7.4s, v6.4s, #25 |
| sli v7.4s, v6.4s, #7 |
| ushr v6.4s, v5.4s, #25 |
| sli v6.4s, v5.4s, #7 |
| ushr v5.4s, v20.4s, #25 |
| sli v5.4s, v20.4s, #7 |
| |
| ext v9.16b, v9.16b, v9.16b, #4 |
| ext v14.16b, v14.16b, v14.16b, #8 |
| ext v19.16b, v19.16b, v19.16b, #12 |
| ldp x11, x12, [x3], 16 |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| add v0.4s, v0.4s, v6.4s |
| add v1.4s, v1.4s, v7.4s |
| add v2.4s, v2.4s, v8.4s |
| add v3.4s, v3.4s, v5.4s |
| add v4.4s, v4.4s, v9.4s |
| |
| eor v18.16b, v18.16b, v0.16b |
| eor v15.16b, v15.16b, v1.16b |
| eor v16.16b, v16.16b, v2.16b |
| eor v17.16b, v17.16b, v3.16b |
| eor v19.16b, v19.16b, v4.16b |
| |
| rev32 v18.8h, v18.8h |
| rev32 v15.8h, v15.8h |
| rev32 v16.8h, v16.8h |
| rev32 v17.8h, v17.8h |
| rev32 v19.8h, v19.8h |
| |
| add v12.4s, v12.4s, v18.4s |
| add v13.4s, v13.4s, v15.4s |
| add v10.4s, v10.4s, v16.4s |
| add v11.4s, v11.4s, v17.4s |
| add v14.4s, v14.4s, v19.4s |
| |
| eor v6.16b, v6.16b, v12.16b |
| eor v7.16b, v7.16b, v13.16b |
| eor v8.16b, v8.16b, v10.16b |
| eor v5.16b, v5.16b, v11.16b |
| eor v9.16b, v9.16b, v14.16b |
| |
| ushr v20.4s, v6.4s, #20 |
| sli v20.4s, v6.4s, #12 |
| ushr v6.4s, v7.4s, #20 |
| sli v6.4s, v7.4s, #12 |
| ushr v7.4s, v8.4s, #20 |
| sli v7.4s, v8.4s, #12 |
| ushr v8.4s, v5.4s, #20 |
| sli v8.4s, v5.4s, #12 |
| ushr v5.4s, v9.4s, #20 |
| sli v5.4s, v9.4s, #12 |
| |
| add v0.4s, v0.4s, v20.4s |
| add v1.4s, v1.4s, v6.4s |
| add v2.4s, v2.4s, v7.4s |
| add v3.4s, v3.4s, v8.4s |
| add v4.4s, v4.4s, v5.4s |
| |
| eor v18.16b, v18.16b, v0.16b |
| eor v15.16b, v15.16b, v1.16b |
| eor v16.16b, v16.16b, v2.16b |
| eor v17.16b, v17.16b, v3.16b |
| eor v19.16b, v19.16b, v4.16b |
| |
| tbl v18.16b, {v18.16b}, v26.16b |
| tbl v15.16b, {v15.16b}, v26.16b |
| tbl v16.16b, {v16.16b}, v26.16b |
| tbl v17.16b, {v17.16b}, v26.16b |
| tbl v19.16b, {v19.16b}, v26.16b |
| |
| add v12.4s, v12.4s, v18.4s |
| add v13.4s, v13.4s, v15.4s |
| add v10.4s, v10.4s, v16.4s |
| add v11.4s, v11.4s, v17.4s |
| add v14.4s, v14.4s, v19.4s |
| |
| eor v20.16b, v20.16b, v12.16b |
| eor v6.16b, v6.16b, v13.16b |
| eor v7.16b, v7.16b, v10.16b |
| eor v8.16b, v8.16b, v11.16b |
| eor v5.16b, v5.16b, v14.16b |
| |
| ushr v9.4s, v5.4s, #25 |
| sli v9.4s, v5.4s, #7 |
| ushr v5.4s, v8.4s, #25 |
| sli v5.4s, v8.4s, #7 |
| ushr v8.4s, v7.4s, #25 |
| sli v8.4s, v7.4s, #7 |
| ushr v7.4s, v6.4s, #25 |
| sli v7.4s, v6.4s, #7 |
| ushr v6.4s, v20.4s, #25 |
| sli v6.4s, v20.4s, #7 |
| |
| ext v9.16b, v9.16b, v9.16b, #12 |
| ext v14.16b, v14.16b, v14.16b, #8 |
| ext v19.16b, v19.16b, v19.16b, #4 |
| subs x7, x7, #1 |
| b.gt Lopen_main_loop_rounds |
| subs x6, x6, #1 |
| b.ge Lopen_main_loop_rounds_short |
| |
| eor v20.16b, v20.16b, v20.16b //zero |
| not v21.16b, v20.16b // -1 |
| sub v21.4s, v25.4s, v21.4s // Add +1 |
| ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) |
| add v19.4s, v19.4s, v20.4s |
| |
| add v15.4s, v15.4s, v25.4s |
| mov x11, #5 |
| dup v20.4s, w11 |
| add v25.4s, v25.4s, v20.4s |
| |
| zip1 v20.4s, v0.4s, v1.4s |
| zip2 v21.4s, v0.4s, v1.4s |
| zip1 v22.4s, v2.4s, v3.4s |
| zip2 v23.4s, v2.4s, v3.4s |
| |
| zip1 v0.2d, v20.2d, v22.2d |
| zip2 v1.2d, v20.2d, v22.2d |
| zip1 v2.2d, v21.2d, v23.2d |
| zip2 v3.2d, v21.2d, v23.2d |
| |
| zip1 v20.4s, v5.4s, v6.4s |
| zip2 v21.4s, v5.4s, v6.4s |
| zip1 v22.4s, v7.4s, v8.4s |
| zip2 v23.4s, v7.4s, v8.4s |
| |
| zip1 v5.2d, v20.2d, v22.2d |
| zip2 v6.2d, v20.2d, v22.2d |
| zip1 v7.2d, v21.2d, v23.2d |
| zip2 v8.2d, v21.2d, v23.2d |
| |
| zip1 v20.4s, v10.4s, v11.4s |
| zip2 v21.4s, v10.4s, v11.4s |
| zip1 v22.4s, v12.4s, v13.4s |
| zip2 v23.4s, v12.4s, v13.4s |
| |
| zip1 v10.2d, v20.2d, v22.2d |
| zip2 v11.2d, v20.2d, v22.2d |
| zip1 v12.2d, v21.2d, v23.2d |
| zip2 v13.2d, v21.2d, v23.2d |
| |
| zip1 v20.4s, v15.4s, v16.4s |
| zip2 v21.4s, v15.4s, v16.4s |
| zip1 v22.4s, v17.4s, v18.4s |
| zip2 v23.4s, v17.4s, v18.4s |
| |
| zip1 v15.2d, v20.2d, v22.2d |
| zip2 v16.2d, v20.2d, v22.2d |
| zip1 v17.2d, v21.2d, v23.2d |
| zip2 v18.2d, v21.2d, v23.2d |
| |
| add v0.4s, v0.4s, v24.4s |
| add v5.4s, v5.4s, v28.4s |
| add v10.4s, v10.4s, v29.4s |
| add v15.4s, v15.4s, v30.4s |
| |
| add v1.4s, v1.4s, v24.4s |
| add v6.4s, v6.4s, v28.4s |
| add v11.4s, v11.4s, v29.4s |
| add v16.4s, v16.4s, v30.4s |
| |
| add v2.4s, v2.4s, v24.4s |
| add v7.4s, v7.4s, v28.4s |
| add v12.4s, v12.4s, v29.4s |
| add v17.4s, v17.4s, v30.4s |
| |
| add v3.4s, v3.4s, v24.4s |
| add v8.4s, v8.4s, v28.4s |
| add v13.4s, v13.4s, v29.4s |
| add v18.4s, v18.4s, v30.4s |
| |
| add v4.4s, v4.4s, v24.4s |
| add v9.4s, v9.4s, v28.4s |
| add v14.4s, v14.4s, v29.4s |
| add v19.4s, v19.4s, v30.4s |
| |
| // We can always safely store 192 bytes |
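// (The main loop is only entered with at least 192 bytes remaining, so the first
// three blocks are stored unconditionally; the fourth and fifth are guarded by the
// length checks below.)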
| ld1 {v20.16b - v23.16b}, [x1], #64 |
| eor v20.16b, v20.16b, v0.16b |
| eor v21.16b, v21.16b, v5.16b |
| eor v22.16b, v22.16b, v10.16b |
| eor v23.16b, v23.16b, v15.16b |
| st1 {v20.16b - v23.16b}, [x0], #64 |
| |
| ld1 {v20.16b - v23.16b}, [x1], #64 |
| eor v20.16b, v20.16b, v1.16b |
| eor v21.16b, v21.16b, v6.16b |
| eor v22.16b, v22.16b, v11.16b |
| eor v23.16b, v23.16b, v16.16b |
| st1 {v20.16b - v23.16b}, [x0], #64 |
| |
| ld1 {v20.16b - v23.16b}, [x1], #64 |
| eor v20.16b, v20.16b, v2.16b |
| eor v21.16b, v21.16b, v7.16b |
| eor v22.16b, v22.16b, v12.16b |
| eor v23.16b, v23.16b, v17.16b |
| st1 {v20.16b - v23.16b}, [x0], #64 |
| |
| sub x2, x2, #192 |
| |
| mov v0.16b, v3.16b |
| mov v5.16b, v8.16b |
| mov v10.16b, v13.16b |
| mov v15.16b, v18.16b |
| |
| cmp x2, #64 |
| b.lt Lopen_tail_64_store |
| |
| ld1 {v20.16b - v23.16b}, [x1], #64 |
| eor v20.16b, v20.16b, v3.16b |
| eor v21.16b, v21.16b, v8.16b |
| eor v22.16b, v22.16b, v13.16b |
| eor v23.16b, v23.16b, v18.16b |
| st1 {v20.16b - v23.16b}, [x0], #64 |
| |
| sub x2, x2, #64 |
| |
| mov v0.16b, v4.16b |
| mov v5.16b, v9.16b |
| mov v10.16b, v14.16b |
| mov v15.16b, v19.16b |
| |
| cmp x2, #64 |
| b.lt Lopen_tail_64_store |
| |
| ld1 {v20.16b - v23.16b}, [x1], #64 |
| eor v20.16b, v20.16b, v4.16b |
| eor v21.16b, v21.16b, v9.16b |
| eor v22.16b, v22.16b, v14.16b |
| eor v23.16b, v23.16b, v19.16b |
| st1 {v20.16b - v23.16b}, [x0], #64 |
| |
| sub x2, x2, #64 |
| b Lopen_main_loop |
| |
| Lopen_tail: |
| |
| cbz x2, Lopen_finalize |
| |
| lsr x4, x2, #4 // How many whole blocks we have to hash |
| |
| cmp x2, #64 |
| b.le Lopen_tail_64 |
| cmp x2, #128 |
| b.le Lopen_tail_128 |
| |
| Lopen_tail_192: |
| // We need three more blocks |
| mov v0.16b, v24.16b |
| mov v1.16b, v24.16b |
| mov v2.16b, v24.16b |
| mov v5.16b, v28.16b |
| mov v6.16b, v28.16b |
| mov v7.16b, v28.16b |
| mov v10.16b, v29.16b |
| mov v11.16b, v29.16b |
| mov v12.16b, v29.16b |
| mov v15.16b, v30.16b |
| mov v16.16b, v30.16b |
| mov v17.16b, v30.16b |
| eor v23.16b, v23.16b, v23.16b |
| eor v21.16b, v21.16b, v21.16b |
| ins v23.s[0], v25.s[0] |
| ins v21.d[0], x15 |
| |
| add v22.4s, v23.4s, v21.4s |
| add v21.4s, v22.4s, v21.4s |
| |
| add v15.4s, v15.4s, v21.4s |
| add v16.4s, v16.4s, v23.4s |
| add v17.4s, v17.4s, v22.4s |
| |
| mov x7, #10 |
| subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash |
| csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing |
| sub x4, x4, x7 |
| |
| cbz x7, Lopen_tail_192_rounds_no_hash |
| |
| Lopen_tail_192_rounds: |
| ldp x11, x12, [x3], 16 |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| Lopen_tail_192_rounds_no_hash: |
| add v0.4s, v0.4s, v5.4s |
| add v1.4s, v1.4s, v6.4s |
| add v2.4s, v2.4s, v7.4s |
| eor v15.16b, v15.16b, v0.16b |
| eor v16.16b, v16.16b, v1.16b |
| eor v17.16b, v17.16b, v2.16b |
| rev32 v15.8h, v15.8h |
| rev32 v16.8h, v16.8h |
| rev32 v17.8h, v17.8h |
| |
| add v10.4s, v10.4s, v15.4s |
| add v11.4s, v11.4s, v16.4s |
| add v12.4s, v12.4s, v17.4s |
| eor v5.16b, v5.16b, v10.16b |
| eor v6.16b, v6.16b, v11.16b |
| eor v7.16b, v7.16b, v12.16b |
| ushr v20.4s, v5.4s, #20 |
| sli v20.4s, v5.4s, #12 |
| ushr v5.4s, v6.4s, #20 |
| sli v5.4s, v6.4s, #12 |
| ushr v6.4s, v7.4s, #20 |
| sli v6.4s, v7.4s, #12 |
| |
| add v0.4s, v0.4s, v20.4s |
| add v1.4s, v1.4s, v5.4s |
| add v2.4s, v2.4s, v6.4s |
| eor v15.16b, v15.16b, v0.16b |
| eor v16.16b, v16.16b, v1.16b |
| eor v17.16b, v17.16b, v2.16b |
| tbl v15.16b, {v15.16b}, v26.16b |
| tbl v16.16b, {v16.16b}, v26.16b |
| tbl v17.16b, {v17.16b}, v26.16b |
| |
| add v10.4s, v10.4s, v15.4s |
| add v11.4s, v11.4s, v16.4s |
| add v12.4s, v12.4s, v17.4s |
| eor v20.16b, v20.16b, v10.16b |
| eor v5.16b, v5.16b, v11.16b |
| eor v6.16b, v6.16b, v12.16b |
| ushr v7.4s, v6.4s, #25 |
| sli v7.4s, v6.4s, #7 |
| ushr v6.4s, v5.4s, #25 |
| sli v6.4s, v5.4s, #7 |
| ushr v5.4s, v20.4s, #25 |
| sli v5.4s, v20.4s, #7 |
| |
| ext v5.16b, v5.16b, v5.16b, #4 |
| ext v6.16b, v6.16b, v6.16b, #4 |
| ext v7.16b, v7.16b, v7.16b, #4 |
| |
| ext v10.16b, v10.16b, v10.16b, #8 |
| ext v11.16b, v11.16b, v11.16b, #8 |
| ext v12.16b, v12.16b, v12.16b, #8 |
| |
| ext v15.16b, v15.16b, v15.16b, #12 |
| ext v16.16b, v16.16b, v16.16b, #12 |
| ext v17.16b, v17.16b, v17.16b, #12 |
| add v0.4s, v0.4s, v5.4s |
| add v1.4s, v1.4s, v6.4s |
| add v2.4s, v2.4s, v7.4s |
| eor v15.16b, v15.16b, v0.16b |
| eor v16.16b, v16.16b, v1.16b |
| eor v17.16b, v17.16b, v2.16b |
| rev32 v15.8h, v15.8h |
| rev32 v16.8h, v16.8h |
| rev32 v17.8h, v17.8h |
| |
| add v10.4s, v10.4s, v15.4s |
| add v11.4s, v11.4s, v16.4s |
| add v12.4s, v12.4s, v17.4s |
| eor v5.16b, v5.16b, v10.16b |
| eor v6.16b, v6.16b, v11.16b |
| eor v7.16b, v7.16b, v12.16b |
| ushr v20.4s, v5.4s, #20 |
| sli v20.4s, v5.4s, #12 |
| ushr v5.4s, v6.4s, #20 |
| sli v5.4s, v6.4s, #12 |
| ushr v6.4s, v7.4s, #20 |
| sli v6.4s, v7.4s, #12 |
| |
| add v0.4s, v0.4s, v20.4s |
| add v1.4s, v1.4s, v5.4s |
| add v2.4s, v2.4s, v6.4s |
| eor v15.16b, v15.16b, v0.16b |
| eor v16.16b, v16.16b, v1.16b |
| eor v17.16b, v17.16b, v2.16b |
| tbl v15.16b, {v15.16b}, v26.16b |
| tbl v16.16b, {v16.16b}, v26.16b |
| tbl v17.16b, {v17.16b}, v26.16b |
| |
| add v10.4s, v10.4s, v15.4s |
| add v11.4s, v11.4s, v16.4s |
| add v12.4s, v12.4s, v17.4s |
| eor v20.16b, v20.16b, v10.16b |
| eor v5.16b, v5.16b, v11.16b |
| eor v6.16b, v6.16b, v12.16b |
| ushr v7.4s, v6.4s, #25 |
| sli v7.4s, v6.4s, #7 |
| ushr v6.4s, v5.4s, #25 |
| sli v6.4s, v5.4s, #7 |
| ushr v5.4s, v20.4s, #25 |
| sli v5.4s, v20.4s, #7 |
| |
| ext v5.16b, v5.16b, v5.16b, #12 |
| ext v6.16b, v6.16b, v6.16b, #12 |
| ext v7.16b, v7.16b, v7.16b, #12 |
| |
| ext v10.16b, v10.16b, v10.16b, #8 |
| ext v11.16b, v11.16b, v11.16b, #8 |
| ext v12.16b, v12.16b, v12.16b, #8 |
| |
| ext v15.16b, v15.16b, v15.16b, #4 |
| ext v16.16b, v16.16b, v16.16b, #4 |
| ext v17.16b, v17.16b, v17.16b, #4 |
| subs x7, x7, #1 |
| b.gt Lopen_tail_192_rounds |
| subs x6, x6, #1 |
| b.ge Lopen_tail_192_rounds_no_hash |
| |
// We hashed at most 160 bytes above, so up to 32 bytes may still be left to hash
| Lopen_tail_192_hash: |
| cbz x4, Lopen_tail_192_hash_done |
| ldp x11, x12, [x3], 16 |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| sub x4, x4, #1 |
| b Lopen_tail_192_hash |
| |
| Lopen_tail_192_hash_done: |
| |
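// Feed-forward: add the saved initial state, including each block's counter,
// back into the working state. Blocks 1 and 2 (counters ctr and ctr+1)
// decrypt 128 bytes here; block 0, which carries ctr+2, stays in
// v0/v5/v10/v15 for Lopen_tail_64_store.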
| add v0.4s, v0.4s, v24.4s |
| add v1.4s, v1.4s, v24.4s |
| add v2.4s, v2.4s, v24.4s |
| add v5.4s, v5.4s, v28.4s |
| add v6.4s, v6.4s, v28.4s |
| add v7.4s, v7.4s, v28.4s |
| add v10.4s, v10.4s, v29.4s |
| add v11.4s, v11.4s, v29.4s |
| add v12.4s, v12.4s, v29.4s |
| add v15.4s, v15.4s, v30.4s |
| add v16.4s, v16.4s, v30.4s |
| add v17.4s, v17.4s, v30.4s |
| |
| add v15.4s, v15.4s, v21.4s |
| add v16.4s, v16.4s, v23.4s |
| add v17.4s, v17.4s, v22.4s |
| |
| ld1 {v20.16b - v23.16b}, [x1], #64 |
| |
| eor v20.16b, v20.16b, v1.16b |
| eor v21.16b, v21.16b, v6.16b |
| eor v22.16b, v22.16b, v11.16b |
| eor v23.16b, v23.16b, v16.16b |
| |
| st1 {v20.16b - v23.16b}, [x0], #64 |
| |
| ld1 {v20.16b - v23.16b}, [x1], #64 |
| |
| eor v20.16b, v20.16b, v2.16b |
| eor v21.16b, v21.16b, v7.16b |
| eor v22.16b, v22.16b, v12.16b |
| eor v23.16b, v23.16b, v17.16b |
| |
| st1 {v20.16b - v23.16b}, [x0], #64 |
| |
| sub x2, x2, #128 |
| b Lopen_tail_64_store |
| |
| Lopen_tail_128: |
| // We need two more blocks |
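// Build the two blocks from the saved initial state rows; v25.s[0] is the
// current counter and x15 holds 1, so v23 becomes ctr and v22 becomes ctr+1.
// Block 1 (counter ctr) is consumed first; block 0 keeps ctr+1 for the final
// partial chunk handled by Lopen_tail_64_store.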
| mov v0.16b, v24.16b |
| mov v1.16b, v24.16b |
| mov v5.16b, v28.16b |
| mov v6.16b, v28.16b |
| mov v10.16b, v29.16b |
| mov v11.16b, v29.16b |
| mov v15.16b, v30.16b |
| mov v16.16b, v30.16b |
| eor v23.16b, v23.16b, v23.16b |
| eor v22.16b, v22.16b, v22.16b |
| ins v23.s[0], v25.s[0] |
| ins v22.d[0], x15 |
| add v22.4s, v22.4s, v23.4s |
| |
| add v15.4s, v15.4s, v22.4s |
| add v16.4s, v16.4s, v23.4s |
| |
| mov x6, #10 |
| sub x6, x6, x4 |
| |
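// Run 10 ChaCha20 double rounds over both blocks. x6 = 10 - x4 schedules how
// many iterations run without hashing; the remaining iterations each absorb
// one 16-byte block of ciphertext into the Poly1305 state.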
| Lopen_tail_128_rounds: |
| add v0.4s, v0.4s, v5.4s |
| eor v15.16b, v15.16b, v0.16b |
| rev32 v15.8h, v15.8h |
| |
| add v10.4s, v10.4s, v15.4s |
| eor v5.16b, v5.16b, v10.16b |
| ushr v20.4s, v5.4s, #20 |
| sli v20.4s, v5.4s, #12 |
| add v0.4s, v0.4s, v20.4s |
| eor v15.16b, v15.16b, v0.16b |
| tbl v15.16b, {v15.16b}, v26.16b |
| |
| add v10.4s, v10.4s, v15.4s |
| eor v20.16b, v20.16b, v10.16b |
| ushr v5.4s, v20.4s, #25 |
| sli v5.4s, v20.4s, #7 |
| ext v5.16b, v5.16b, v5.16b, #4 |
| ext v10.16b, v10.16b, v10.16b, #8 |
| ext v15.16b, v15.16b, v15.16b, #12 |
| add v1.4s, v1.4s, v6.4s |
| eor v16.16b, v16.16b, v1.16b |
| rev32 v16.8h, v16.8h |
| |
| add v11.4s, v11.4s, v16.4s |
| eor v6.16b, v6.16b, v11.16b |
| ushr v20.4s, v6.4s, #20 |
| sli v20.4s, v6.4s, #12 |
| add v1.4s, v1.4s, v20.4s |
| eor v16.16b, v16.16b, v1.16b |
| tbl v16.16b, {v16.16b}, v26.16b |
| |
| add v11.4s, v11.4s, v16.4s |
| eor v20.16b, v20.16b, v11.16b |
| ushr v6.4s, v20.4s, #25 |
| sli v6.4s, v20.4s, #7 |
| ext v6.16b, v6.16b, v6.16b, #4 |
| ext v11.16b, v11.16b, v11.16b, #8 |
| ext v16.16b, v16.16b, v16.16b, #12 |
| add v0.4s, v0.4s, v5.4s |
| eor v15.16b, v15.16b, v0.16b |
| rev32 v15.8h, v15.8h |
| |
| add v10.4s, v10.4s, v15.4s |
| eor v5.16b, v5.16b, v10.16b |
| ushr v20.4s, v5.4s, #20 |
| sli v20.4s, v5.4s, #12 |
| add v0.4s, v0.4s, v20.4s |
| eor v15.16b, v15.16b, v0.16b |
| tbl v15.16b, {v15.16b}, v26.16b |
| |
| add v10.4s, v10.4s, v15.4s |
| eor v20.16b, v20.16b, v10.16b |
| ushr v5.4s, v20.4s, #25 |
| sli v5.4s, v20.4s, #7 |
| ext v5.16b, v5.16b, v5.16b, #12 |
| ext v10.16b, v10.16b, v10.16b, #8 |
| ext v15.16b, v15.16b, v15.16b, #4 |
| add v1.4s, v1.4s, v6.4s |
| eor v16.16b, v16.16b, v1.16b |
| rev32 v16.8h, v16.8h |
| |
| add v11.4s, v11.4s, v16.4s |
| eor v6.16b, v6.16b, v11.16b |
| ushr v20.4s, v6.4s, #20 |
| sli v20.4s, v6.4s, #12 |
| add v1.4s, v1.4s, v20.4s |
| eor v16.16b, v16.16b, v1.16b |
| tbl v16.16b, {v16.16b}, v26.16b |
| |
| add v11.4s, v11.4s, v16.4s |
| eor v20.16b, v20.16b, v11.16b |
| ushr v6.4s, v20.4s, #25 |
| sli v6.4s, v20.4s, #7 |
| ext v6.16b, v6.16b, v6.16b, #12 |
| ext v11.16b, v11.16b, v11.16b, #8 |
| ext v16.16b, v16.16b, v16.16b, #4 |
| subs x6, x6, #1 |
| b.gt Lopen_tail_128_rounds |
| cbz x4, Lopen_tail_128_rounds_done |
| subs x4, x4, #1 |
| ldp x11, x12, [x3], 16 |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| b Lopen_tail_128_rounds |
| |
| Lopen_tail_128_rounds_done: |
| add v0.4s, v0.4s, v24.4s |
| add v1.4s, v1.4s, v24.4s |
| add v5.4s, v5.4s, v28.4s |
| add v6.4s, v6.4s, v28.4s |
| add v10.4s, v10.4s, v29.4s |
| add v11.4s, v11.4s, v29.4s |
| add v15.4s, v15.4s, v30.4s |
| add v16.4s, v16.4s, v30.4s |
| add v15.4s, v15.4s, v22.4s |
| add v16.4s, v16.4s, v23.4s |
| |
| ld1 {v20.16b - v23.16b}, [x1], #64 |
| |
| eor v20.16b, v20.16b, v1.16b |
| eor v21.16b, v21.16b, v6.16b |
| eor v22.16b, v22.16b, v11.16b |
| eor v23.16b, v23.16b, v16.16b |
| |
| st1 {v20.16b - v23.16b}, [x0], #64 |
| sub x2, x2, #64 |
| |
| b Lopen_tail_64_store |
| |
| Lopen_tail_64: |
| // We just need a single block |
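// Build one more block with the current counter (v25.s[0]). The loop below
// runs the usual 10 double rounds and, as in the 128-byte case, interleaves
// the hashing of the x4 leftover whole 16-byte ciphertext blocks.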
| mov v0.16b, v24.16b |
| mov v5.16b, v28.16b |
| mov v10.16b, v29.16b |
| mov v15.16b, v30.16b |
| eor v23.16b, v23.16b, v23.16b |
| ins v23.s[0], v25.s[0] |
| add v15.4s, v15.4s, v23.4s |
| |
| mov x6, #10 |
| sub x6, x6, x4 |
| |
| Lopen_tail_64_rounds: |
| add v0.4s, v0.4s, v5.4s |
| eor v15.16b, v15.16b, v0.16b |
| rev32 v15.8h, v15.8h |
| |
| add v10.4s, v10.4s, v15.4s |
| eor v5.16b, v5.16b, v10.16b |
| ushr v20.4s, v5.4s, #20 |
| sli v20.4s, v5.4s, #12 |
| add v0.4s, v0.4s, v20.4s |
| eor v15.16b, v15.16b, v0.16b |
| tbl v15.16b, {v15.16b}, v26.16b |
| |
| add v10.4s, v10.4s, v15.4s |
| eor v20.16b, v20.16b, v10.16b |
| ushr v5.4s, v20.4s, #25 |
| sli v5.4s, v20.4s, #7 |
| ext v5.16b, v5.16b, v5.16b, #4 |
| ext v10.16b, v10.16b, v10.16b, #8 |
| ext v15.16b, v15.16b, v15.16b, #12 |
| add v0.4s, v0.4s, v5.4s |
| eor v15.16b, v15.16b, v0.16b |
| rev32 v15.8h, v15.8h |
| |
| add v10.4s, v10.4s, v15.4s |
| eor v5.16b, v5.16b, v10.16b |
| ushr v20.4s, v5.4s, #20 |
| sli v20.4s, v5.4s, #12 |
| add v0.4s, v0.4s, v20.4s |
| eor v15.16b, v15.16b, v0.16b |
| tbl v15.16b, {v15.16b}, v26.16b |
| |
| add v10.4s, v10.4s, v15.4s |
| eor v20.16b, v20.16b, v10.16b |
| ushr v5.4s, v20.4s, #25 |
| sli v5.4s, v20.4s, #7 |
| ext v5.16b, v5.16b, v5.16b, #12 |
| ext v10.16b, v10.16b, v10.16b, #8 |
| ext v15.16b, v15.16b, v15.16b, #4 |
| subs x6, x6, #1 |
| b.gt Lopen_tail_64_rounds |
| cbz x4, Lopen_tail_64_rounds_done |
| subs x4, x4, #1 |
| ldp x11, x12, [x3], 16 |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| b Lopen_tail_64_rounds |
| |
| Lopen_tail_64_rounds_done: |
| add v0.4s, v0.4s, v24.4s |
| add v5.4s, v5.4s, v28.4s |
| add v10.4s, v10.4s, v29.4s |
| add v15.4s, v15.4s, v30.4s |
| add v15.4s, v15.4s, v23.4s |
| |
| Lopen_tail_64_store: |
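// Decrypt the remaining data 16 bytes at a time: xor the ciphertext with the
// keystream row in v0, then rotate v5/v10/v15 down so the next row is ready.
// Fewer than 16 remaining bytes are handled by Lopen_tail_16.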
| cmp x2, #16 |
| b.lt Lopen_tail_16 |
| |
| ld1 {v20.16b}, [x1], #16 |
| eor v20.16b, v20.16b, v0.16b |
| st1 {v20.16b}, [x0], #16 |
| mov v0.16b, v5.16b |
| mov v5.16b, v10.16b |
| mov v10.16b, v15.16b |
| sub x2, x2, #16 |
| b Lopen_tail_64_store |
| |
| Lopen_tail_16: |
| // Here we handle the last [0,16) bytes that require a padded block |
| cbz x2, Lopen_finalize |
| |
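// Load the final 1-15 ciphertext bytes back to front into v20 while building
// a matching byte mask in v21. The masked block is absorbed as the last
// (zero-padded) Poly1305 block, then xored with the keystream in v0 and
// written out one byte at a time.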
| eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext |
| eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask |
| not v22.16b, v20.16b |
| |
| add x7, x1, x2 |
| mov x6, x2 |
| |
| Lopen_tail_16_compose: |
| ext v20.16b, v20.16b, v20.16b, #15 |
| ldrb w11, [x7, #-1]! |
| mov v20.b[0], w11 |
| ext v21.16b, v22.16b, v21.16b, #15 |
| subs x2, x2, #1 |
| b.gt Lopen_tail_16_compose |
| |
| and v20.16b, v20.16b, v21.16b |
| // Hash in the final padded block |
| mov x11, v20.d[0] |
| mov x12, v20.d[1] |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| eor v20.16b, v20.16b, v0.16b |
| |
| Lopen_tail_16_store: |
| umov w11, v20.b[0] |
| strb w11, [x0], #1 |
| ext v20.16b, v20.16b, v20.16b, #1 |
| subs x6, x6, #1 |
| b.gt Lopen_tail_16_store |
| |
| Lopen_finalize: |
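// Finalize the tag: absorb one last block from v31 (the Poly1305 length block
// holding the AAD and ciphertext lengths), reduce the accumulator modulo
// 2^130 - 5, add the second half of the one-time key (s, kept in v27) and
// write the 16-byte tag through x5 for the caller, before restoring the
// callee-saved registers. Reference: tag = ((acc mod 2^130 - 5) + s) mod 2^128.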
| mov x11, v31.d[0] |
| mov x12, v31.d[1] |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
// Final reduction: conditionally subtract p = 2^130 - 5 (limb constants -5, -1, 3 below) and keep the result only if the subtraction did not borrow
| sub x12, xzr, x15 |
| orr x13, xzr, #3 |
| subs x11, x8, #-5 |
| sbcs x12, x9, x12 |
| sbcs x13, x10, x13 |
| csel x8, x11, x8, cs |
| csel x9, x12, x9, cs |
| csel x10, x13, x10, cs |
| mov x11, v27.d[0] |
| mov x12, v27.d[1] |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| |
| stp x8, x9, [x5] |
| |
| ldp d8, d9, [sp, #16] |
| ldp d10, d11, [sp, #32] |
| ldp d12, d13, [sp, #48] |
| ldp d14, d15, [sp, #64] |
| .cfi_restore b15 |
| .cfi_restore b14 |
| .cfi_restore b13 |
| .cfi_restore b12 |
| .cfi_restore b11 |
| .cfi_restore b10 |
| .cfi_restore b9 |
| .cfi_restore b8 |
| ldp x29, x30, [sp], 80 |
| .cfi_restore w29 |
| .cfi_restore w30 |
| .cfi_def_cfa_offset 0 |
| AARCH64_VALIDATE_LINK_REGISTER |
| ret |
| |
| Lopen_128: |
// On some architectures preparing 5 blocks for small buffers is wasteful, so
// for at most 128 bytes of data compute only three: one for the Poly1305 key
// and two for the keystream
| eor v25.16b, v25.16b, v25.16b |
| mov x11, #1 |
| mov v25.s[0], w11 |
| mov v0.16b, v24.16b |
| mov v1.16b, v24.16b |
| mov v2.16b, v24.16b |
| mov v5.16b, v28.16b |
| mov v6.16b, v28.16b |
| mov v7.16b, v28.16b |
| mov v10.16b, v29.16b |
| mov v11.16b, v29.16b |
| mov v12.16b, v29.16b |
| mov v17.16b, v30.16b |
| add v15.4s, v17.4s, v25.4s |
| add v16.4s, v15.4s, v25.4s |
| |
| mov x6, #10 |
| |
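// Run the standard 10 double rounds over all three blocks. Poly1305 hashing
// is not interleaved here because the key block (the one whose counter row
// stayed in v17) is still being computed.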
| Lopen_128_rounds: |
| add v0.4s, v0.4s, v5.4s |
| add v1.4s, v1.4s, v6.4s |
| add v2.4s, v2.4s, v7.4s |
| eor v15.16b, v15.16b, v0.16b |
| eor v16.16b, v16.16b, v1.16b |
| eor v17.16b, v17.16b, v2.16b |
| rev32 v15.8h, v15.8h |
| rev32 v16.8h, v16.8h |
| rev32 v17.8h, v17.8h |
| |
| add v10.4s, v10.4s, v15.4s |
| add v11.4s, v11.4s, v16.4s |
| add v12.4s, v12.4s, v17.4s |
| eor v5.16b, v5.16b, v10.16b |
| eor v6.16b, v6.16b, v11.16b |
| eor v7.16b, v7.16b, v12.16b |
| ushr v20.4s, v5.4s, #20 |
| sli v20.4s, v5.4s, #12 |
| ushr v5.4s, v6.4s, #20 |
| sli v5.4s, v6.4s, #12 |
| ushr v6.4s, v7.4s, #20 |
| sli v6.4s, v7.4s, #12 |
| |
| add v0.4s, v0.4s, v20.4s |
| add v1.4s, v1.4s, v5.4s |
| add v2.4s, v2.4s, v6.4s |
| eor v15.16b, v15.16b, v0.16b |
| eor v16.16b, v16.16b, v1.16b |
| eor v17.16b, v17.16b, v2.16b |
| tbl v15.16b, {v15.16b}, v26.16b |
| tbl v16.16b, {v16.16b}, v26.16b |
| tbl v17.16b, {v17.16b}, v26.16b |
| |
| add v10.4s, v10.4s, v15.4s |
| add v11.4s, v11.4s, v16.4s |
| add v12.4s, v12.4s, v17.4s |
| eor v20.16b, v20.16b, v10.16b |
| eor v5.16b, v5.16b, v11.16b |
| eor v6.16b, v6.16b, v12.16b |
| ushr v7.4s, v6.4s, #25 |
| sli v7.4s, v6.4s, #7 |
| ushr v6.4s, v5.4s, #25 |
| sli v6.4s, v5.4s, #7 |
| ushr v5.4s, v20.4s, #25 |
| sli v5.4s, v20.4s, #7 |
| |
| ext v5.16b, v5.16b, v5.16b, #4 |
| ext v6.16b, v6.16b, v6.16b, #4 |
| ext v7.16b, v7.16b, v7.16b, #4 |
| |
| ext v10.16b, v10.16b, v10.16b, #8 |
| ext v11.16b, v11.16b, v11.16b, #8 |
| ext v12.16b, v12.16b, v12.16b, #8 |
| |
| ext v15.16b, v15.16b, v15.16b, #12 |
| ext v16.16b, v16.16b, v16.16b, #12 |
| ext v17.16b, v17.16b, v17.16b, #12 |
| add v0.4s, v0.4s, v5.4s |
| add v1.4s, v1.4s, v6.4s |
| add v2.4s, v2.4s, v7.4s |
| eor v15.16b, v15.16b, v0.16b |
| eor v16.16b, v16.16b, v1.16b |
| eor v17.16b, v17.16b, v2.16b |
| rev32 v15.8h, v15.8h |
| rev32 v16.8h, v16.8h |
| rev32 v17.8h, v17.8h |
| |
| add v10.4s, v10.4s, v15.4s |
| add v11.4s, v11.4s, v16.4s |
| add v12.4s, v12.4s, v17.4s |
| eor v5.16b, v5.16b, v10.16b |
| eor v6.16b, v6.16b, v11.16b |
| eor v7.16b, v7.16b, v12.16b |
| ushr v20.4s, v5.4s, #20 |
| sli v20.4s, v5.4s, #12 |
| ushr v5.4s, v6.4s, #20 |
| sli v5.4s, v6.4s, #12 |
| ushr v6.4s, v7.4s, #20 |
| sli v6.4s, v7.4s, #12 |
| |
| add v0.4s, v0.4s, v20.4s |
| add v1.4s, v1.4s, v5.4s |
| add v2.4s, v2.4s, v6.4s |
| eor v15.16b, v15.16b, v0.16b |
| eor v16.16b, v16.16b, v1.16b |
| eor v17.16b, v17.16b, v2.16b |
| tbl v15.16b, {v15.16b}, v26.16b |
| tbl v16.16b, {v16.16b}, v26.16b |
| tbl v17.16b, {v17.16b}, v26.16b |
| |
| add v10.4s, v10.4s, v15.4s |
| add v11.4s, v11.4s, v16.4s |
| add v12.4s, v12.4s, v17.4s |
| eor v20.16b, v20.16b, v10.16b |
| eor v5.16b, v5.16b, v11.16b |
| eor v6.16b, v6.16b, v12.16b |
| ushr v7.4s, v6.4s, #25 |
| sli v7.4s, v6.4s, #7 |
| ushr v6.4s, v5.4s, #25 |
| sli v6.4s, v5.4s, #7 |
| ushr v5.4s, v20.4s, #25 |
| sli v5.4s, v20.4s, #7 |
| |
| ext v5.16b, v5.16b, v5.16b, #12 |
| ext v6.16b, v6.16b, v6.16b, #12 |
| ext v7.16b, v7.16b, v7.16b, #12 |
| |
| ext v10.16b, v10.16b, v10.16b, #8 |
| ext v11.16b, v11.16b, v11.16b, #8 |
| ext v12.16b, v12.16b, v12.16b, #8 |
| |
| ext v15.16b, v15.16b, v15.16b, #4 |
| ext v16.16b, v16.16b, v16.16b, #4 |
| ext v17.16b, v17.16b, v17.16b, #4 |
| subs x6, x6, #1 |
| b.hi Lopen_128_rounds |
| |
| add v0.4s, v0.4s, v24.4s |
| add v1.4s, v1.4s, v24.4s |
| add v2.4s, v2.4s, v24.4s |
| |
| add v5.4s, v5.4s, v28.4s |
| add v6.4s, v6.4s, v28.4s |
| add v7.4s, v7.4s, v28.4s |
| |
| add v10.4s, v10.4s, v29.4s |
| add v11.4s, v11.4s, v29.4s |
| |
| add v30.4s, v30.4s, v25.4s |
| add v15.4s, v15.4s, v30.4s |
| add v30.4s, v30.4s, v25.4s |
| add v16.4s, v16.4s, v30.4s |
| |
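// The block that kept the original counter, now in v2/v7 after the
// feed-forward, supplies the one-time Poly1305 key: r is its first 16 bytes
// clamped with v27 (presumably the Lclamp mask), s the next 16 bytes.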
| and v2.16b, v2.16b, v27.16b |
| mov x16, v2.d[0] // Move the R key to GPRs |
| mov x17, v2.d[1] |
| mov v27.16b, v7.16b // Store the S key |
| |
| bl Lpoly_hash_ad_internal |
| |
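// With r and s in place, hash the AAD (call above), then decrypt: a full
// 64-byte chunk, if present, is hashed and xored with the first data block;
// whatever remains is hashed 16 bytes at a time below and decrypted via
// Lopen_tail_64_store.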
| Lopen_128_store: |
| cmp x2, #64 |
| b.lt Lopen_128_store_64 |
| |
| ld1 {v20.16b - v23.16b}, [x1], #64 |
| |
| mov x11, v20.d[0] |
| mov x12, v20.d[1] |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| mov x11, v21.d[0] |
| mov x12, v21.d[1] |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| mov x11, v22.d[0] |
| mov x12, v22.d[1] |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| mov x11, v23.d[0] |
| mov x12, v23.d[1] |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| |
| eor v20.16b, v20.16b, v0.16b |
| eor v21.16b, v21.16b, v5.16b |
| eor v22.16b, v22.16b, v10.16b |
| eor v23.16b, v23.16b, v15.16b |
| |
| st1 {v20.16b - v23.16b}, [x0], #64 |
| |
| sub x2, x2, #64 |
| |
| mov v0.16b, v1.16b |
| mov v5.16b, v6.16b |
| mov v10.16b, v11.16b |
| mov v15.16b, v16.16b |
| |
| Lopen_128_store_64: |
| |
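// x4 = number of whole 16-byte ciphertext blocks still to hash, x3 = hash
// input pointer. Absorb them, then jump to Lopen_tail_64_store to decrypt
// what is left with the keystream in v0/v5/v10/v15.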
| lsr x4, x2, #4 |
| mov x3, x1 |
| |
| Lopen_128_hash_64: |
| cbz x4, Lopen_tail_64_store |
| ldp x11, x12, [x3], 16 |
| adds x8, x8, x11 |
| adcs x9, x9, x12 |
| adc x10, x10, x15 |
| mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 |
| umulh x12, x8, x16 |
| mul x13, x9, x16 |
| umulh x14, x9, x16 |
| adds x12, x12, x13 |
| mul x13, x10, x16 |
| adc x13, x13, x14 |
| mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] |
| umulh x8, x8, x17 |
| adds x12, x12, x14 |
| mul x14, x9, x17 |
| umulh x9, x9, x17 |
| adcs x14, x14, x8 |
| mul x10, x10, x17 |
| adc x10, x10, x9 |
| adds x13, x13, x14 |
| adc x14, x10, xzr |
| and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) |
| and x8, x13, #-4 |
| extr x13, x14, x13, #2 |
| adds x8, x8, x11 |
| lsr x11, x14, #2 |
| adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits |
| adds x8, x8, x13 |
| adcs x9, x9, x12 |
| adc x10, x10, xzr // At this point acc2 has the value of 4 at most |
| sub x4, x4, #1 |
| b Lopen_128_hash_64 |
| .cfi_endproc |
| |
#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)