| // This file is generated from a similarly-named Perl script in the BoringSSL |
| // source tree. Do not edit by hand. |
| |
| #include <openssl/asm_base.h> |
| |
| #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) |
| .section .rodata |
| .align 64 |
| chacha20_poly1305_constants: |
| .Lchacha20_consts: |
| .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' |
| .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' |
| .Lrol8: |
| .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 |
| .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 |
| .Lrol16: |
| .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 |
| .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 |
| .Lavx2_init: |
| .long 0,0,0,0 |
| .Lsse_inc: |
| .long 1,0,0,0 |
| .Lavx2_inc: |
| .long 2,0,0,0,2,0,0,0 |
| .Lclamp: |
| .quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC |
| .quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF |
| .align 16 |
| .Land_masks: |
| .byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 |
| .byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 |
| .byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 |
| .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 |
| .byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 |
| .byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 |
| .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 |
| .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 |
| .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 |
| .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 |
| .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 |
| .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 |
| .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 |
| .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 |
| .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 |
| .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff |
| .text |
| |
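| // poly_hash_ad_internal: absorbs the additional data into the Poly1305 |
| // accumulator held in %r10:%r11:%r12. %rcx points at the AAD, %r8 holds its |
| // length, and the clamped Poly1305 "r" key is read from 0(%rbp) and 8(%rbp). |
| // A fast path folds in the common 13-byte TLS AAD as a single block. |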
| .type poly_hash_ad_internal,@function |
| .align 64 |
| poly_hash_ad_internal: |
| .cfi_startproc |
| .cfi_def_cfa rsp, 8 |
| xorq %r10,%r10 |
| xorq %r11,%r11 |
| xorq %r12,%r12 |
| cmpq $13,%r8 |
| jne .Lhash_ad_loop |
| .Lpoly_fast_tls_ad: |
| |
| movq (%rcx),%r10 |
| movq 5(%rcx),%r11 |
| shrq $24,%r11 |
| movq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| ret |
| .Lhash_ad_loop: |
| |
| cmpq $16,%r8 |
| jb .Lhash_ad_tail |
| addq 0+0(%rcx),%r10 |
| adcq 8+0(%rcx),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 16(%rcx),%rcx |
| subq $16,%r8 |
| jmp .Lhash_ad_loop |
| .Lhash_ad_tail: |
| cmpq $0,%r8 |
| je .Lhash_ad_done |
| |
| xorq %r13,%r13 |
| xorq %r14,%r14 |
| xorq %r15,%r15 |
| addq %r8,%rcx |
| .Lhash_ad_tail_loop: |
| shldq $8,%r13,%r14 |
| shlq $8,%r13 |
| movzbq -1(%rcx),%r15 |
| xorq %r15,%r13 |
| decq %rcx |
| decq %r8 |
| jne .Lhash_ad_tail_loop |
| |
| addq %r13,%r10 |
| adcq %r14,%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| |
| .Lhash_ad_done: |
| ret |
| .cfi_endproc |
| .size poly_hash_ad_internal, .-poly_hash_ad_internal |
| |
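| // chacha20_poly1305_open_nohw: SSE open (decrypt) path. Register arguments: |
| // %rdi = plaintext out, %rsi = ciphertext in, %rdx = ciphertext length, |
| // %rcx = AAD, %r8 = AAD length, %r9 = 32-byte key followed by the 16-byte |
| // counter/nonce block. The computed Poly1305 tag is written back through the |
| // saved %r9 pointer before returning. |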
| .globl chacha20_poly1305_open_nohw |
| .hidden chacha20_poly1305_open_nohw |
| .type chacha20_poly1305_open_nohw,@function |
| .align 64 |
| chacha20_poly1305_open_nohw: |
| .cfi_startproc |
| _CET_ENDBR |
| pushq %rbp |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %rbp,-16 |
| pushq %rbx |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %rbx,-24 |
| pushq %r12 |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %r12,-32 |
| pushq %r13 |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %r13,-40 |
| pushq %r14 |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %r14,-48 |
| pushq %r15 |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %r15,-56 |
| |
| |
| pushq %r9 |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %r9,-64 |
| subq $288 + 0 + 32,%rsp |
| .cfi_adjust_cfa_offset 288 + 32 |
| |
| leaq 32(%rsp),%rbp |
| andq $-32,%rbp |
| |
| movq %rdx,%rbx |
| movq %r8,0+0+32(%rbp) |
| movq %rbx,8+0+32(%rbp) |
| |
| cmpq $128,%rbx |
| jbe .Lopen_sse_128 |
| |
| movdqa .Lchacha20_consts(%rip),%xmm0 |
| movdqu 0(%r9),%xmm4 |
| movdqu 16(%r9),%xmm8 |
| movdqu 32(%r9),%xmm12 |
| |
| movdqa %xmm12,%xmm7 |
| |
| movdqa %xmm4,0+48(%rbp) |
| movdqa %xmm8,0+64(%rbp) |
| movdqa %xmm12,0+96(%rbp) |
| movq $10,%r10 |
| .Lopen_sse_init_rounds: |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol16(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm4 |
| pxor %xmm3,%xmm4 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol8(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm4 |
| pxor %xmm3,%xmm4 |
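| // The .byte runs in the round code are hand-encoded SSSE3 palignr/pshufb |
| // instructions, emitted as raw opcodes, presumably for compatibility with |
| // older assemblers. |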
| .byte 102,15,58,15,228,4 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,12 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol16(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm4 |
| pxor %xmm3,%xmm4 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol8(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm4 |
| pxor %xmm3,%xmm4 |
| .byte 102,15,58,15,228,12 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,4 |
| |
| decq %r10 |
| jne .Lopen_sse_init_rounds |
| |
| paddd .Lchacha20_consts(%rip),%xmm0 |
| paddd 0+48(%rbp),%xmm4 |
| |
| pand .Lclamp(%rip),%xmm0 |
| movdqa %xmm0,0+0(%rbp) |
| movdqa %xmm4,0+16(%rbp) |
| |
| movq %r8,%r8 |
| call poly_hash_ad_internal |
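| // Main open loop: each iteration derives four parallel ChaCha20 blocks |
| // (256 bytes of keystream) while Poly1305 hashes the same 256 bytes of |
| // ciphertext, then XORs the keystream into the input and stores the result. |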
| .Lopen_sse_main_loop: |
| cmpq $256,%rbx |
| jb .Lopen_sse_tail |
| |
| movdqa .Lchacha20_consts(%rip),%xmm0 |
| movdqa 0+48(%rbp),%xmm4 |
| movdqa 0+64(%rbp),%xmm8 |
| movdqa %xmm0,%xmm1 |
| movdqa %xmm4,%xmm5 |
| movdqa %xmm8,%xmm9 |
| movdqa %xmm0,%xmm2 |
| movdqa %xmm4,%xmm6 |
| movdqa %xmm8,%xmm10 |
| movdqa %xmm0,%xmm3 |
| movdqa %xmm4,%xmm7 |
| movdqa %xmm8,%xmm11 |
| movdqa 0+96(%rbp),%xmm15 |
| paddd .Lsse_inc(%rip),%xmm15 |
| movdqa %xmm15,%xmm14 |
| paddd .Lsse_inc(%rip),%xmm14 |
| movdqa %xmm14,%xmm13 |
| paddd .Lsse_inc(%rip),%xmm13 |
| movdqa %xmm13,%xmm12 |
| paddd .Lsse_inc(%rip),%xmm12 |
| movdqa %xmm12,0+96(%rbp) |
| movdqa %xmm13,0+112(%rbp) |
| movdqa %xmm14,0+128(%rbp) |
| movdqa %xmm15,0+144(%rbp) |
| |
| |
| |
| movq $4,%rcx |
| movq %rsi,%r8 |
| .Lopen_sse_main_loop_rounds: |
| movdqa %xmm8,0+80(%rbp) |
| movdqa .Lrol16(%rip),%xmm8 |
| paddd %xmm7,%xmm3 |
| paddd %xmm6,%xmm2 |
| paddd %xmm5,%xmm1 |
| paddd %xmm4,%xmm0 |
| pxor %xmm3,%xmm15 |
| pxor %xmm2,%xmm14 |
| pxor %xmm1,%xmm13 |
| pxor %xmm0,%xmm12 |
| .byte 102,69,15,56,0,248 |
| .byte 102,69,15,56,0,240 |
| .byte 102,69,15,56,0,232 |
| .byte 102,69,15,56,0,224 |
| movdqa 0+80(%rbp),%xmm8 |
| paddd %xmm15,%xmm11 |
| paddd %xmm14,%xmm10 |
| paddd %xmm13,%xmm9 |
| paddd %xmm12,%xmm8 |
| pxor %xmm11,%xmm7 |
| addq 0+0(%r8),%r10 |
| adcq 8+0(%r8),%r11 |
| adcq $1,%r12 |
| |
| leaq 16(%r8),%r8 |
| pxor %xmm10,%xmm6 |
| pxor %xmm9,%xmm5 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm8,0+80(%rbp) |
| movdqa %xmm7,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm7 |
| pxor %xmm8,%xmm7 |
| movdqa %xmm6,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm6 |
| pxor %xmm8,%xmm6 |
| movdqa %xmm5,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm5 |
| pxor %xmm8,%xmm5 |
| movdqa %xmm4,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm4 |
| pxor %xmm8,%xmm4 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movdqa .Lrol8(%rip),%xmm8 |
| paddd %xmm7,%xmm3 |
| paddd %xmm6,%xmm2 |
| paddd %xmm5,%xmm1 |
| paddd %xmm4,%xmm0 |
| pxor %xmm3,%xmm15 |
| pxor %xmm2,%xmm14 |
| pxor %xmm1,%xmm13 |
| pxor %xmm0,%xmm12 |
| .byte 102,69,15,56,0,248 |
| .byte 102,69,15,56,0,240 |
| .byte 102,69,15,56,0,232 |
| .byte 102,69,15,56,0,224 |
| movdqa 0+80(%rbp),%xmm8 |
| paddd %xmm15,%xmm11 |
| paddd %xmm14,%xmm10 |
| paddd %xmm13,%xmm9 |
| paddd %xmm12,%xmm8 |
| pxor %xmm11,%xmm7 |
| pxor %xmm10,%xmm6 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| pxor %xmm9,%xmm5 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm8,0+80(%rbp) |
| movdqa %xmm7,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm7 |
| pxor %xmm8,%xmm7 |
| movdqa %xmm6,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm6 |
| pxor %xmm8,%xmm6 |
| movdqa %xmm5,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm5 |
| pxor %xmm8,%xmm5 |
| movdqa %xmm4,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm4 |
| pxor %xmm8,%xmm4 |
| movdqa 0+80(%rbp),%xmm8 |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| .byte 102,15,58,15,255,4 |
| .byte 102,69,15,58,15,219,8 |
| .byte 102,69,15,58,15,255,12 |
| .byte 102,15,58,15,246,4 |
| .byte 102,69,15,58,15,210,8 |
| .byte 102,69,15,58,15,246,12 |
| .byte 102,15,58,15,237,4 |
| .byte 102,69,15,58,15,201,8 |
| .byte 102,69,15,58,15,237,12 |
| .byte 102,15,58,15,228,4 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,12 |
| movdqa %xmm8,0+80(%rbp) |
| movdqa .Lrol16(%rip),%xmm8 |
| paddd %xmm7,%xmm3 |
| paddd %xmm6,%xmm2 |
| paddd %xmm5,%xmm1 |
| paddd %xmm4,%xmm0 |
| pxor %xmm3,%xmm15 |
| pxor %xmm2,%xmm14 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| pxor %xmm1,%xmm13 |
| pxor %xmm0,%xmm12 |
| .byte 102,69,15,56,0,248 |
| .byte 102,69,15,56,0,240 |
| .byte 102,69,15,56,0,232 |
| .byte 102,69,15,56,0,224 |
| movdqa 0+80(%rbp),%xmm8 |
| paddd %xmm15,%xmm11 |
| paddd %xmm14,%xmm10 |
| paddd %xmm13,%xmm9 |
| paddd %xmm12,%xmm8 |
| pxor %xmm11,%xmm7 |
| pxor %xmm10,%xmm6 |
| pxor %xmm9,%xmm5 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm8,0+80(%rbp) |
| movdqa %xmm7,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm7 |
| pxor %xmm8,%xmm7 |
| movdqa %xmm6,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm6 |
| pxor %xmm8,%xmm6 |
| movdqa %xmm5,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm5 |
| pxor %xmm8,%xmm5 |
| movdqa %xmm4,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm4 |
| pxor %xmm8,%xmm4 |
| movdqa .Lrol8(%rip),%xmm8 |
| paddd %xmm7,%xmm3 |
| paddd %xmm6,%xmm2 |
| paddd %xmm5,%xmm1 |
| paddd %xmm4,%xmm0 |
| pxor %xmm3,%xmm15 |
| pxor %xmm2,%xmm14 |
| pxor %xmm1,%xmm13 |
| pxor %xmm0,%xmm12 |
| .byte 102,69,15,56,0,248 |
| .byte 102,69,15,56,0,240 |
| .byte 102,69,15,56,0,232 |
| .byte 102,69,15,56,0,224 |
| movdqa 0+80(%rbp),%xmm8 |
| paddd %xmm15,%xmm11 |
| paddd %xmm14,%xmm10 |
| paddd %xmm13,%xmm9 |
| paddd %xmm12,%xmm8 |
| pxor %xmm11,%xmm7 |
| pxor %xmm10,%xmm6 |
| pxor %xmm9,%xmm5 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm8,0+80(%rbp) |
| movdqa %xmm7,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm7 |
| pxor %xmm8,%xmm7 |
| movdqa %xmm6,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm6 |
| pxor %xmm8,%xmm6 |
| movdqa %xmm5,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm5 |
| pxor %xmm8,%xmm5 |
| movdqa %xmm4,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm4 |
| pxor %xmm8,%xmm4 |
| movdqa 0+80(%rbp),%xmm8 |
| .byte 102,15,58,15,255,12 |
| .byte 102,69,15,58,15,219,8 |
| .byte 102,69,15,58,15,255,4 |
| .byte 102,15,58,15,246,12 |
| .byte 102,69,15,58,15,210,8 |
| .byte 102,69,15,58,15,246,4 |
| .byte 102,15,58,15,237,12 |
| .byte 102,69,15,58,15,201,8 |
| .byte 102,69,15,58,15,237,4 |
| .byte 102,15,58,15,228,12 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,4 |
| |
| decq %rcx |
| jge .Lopen_sse_main_loop_rounds |
| addq 0+0(%r8),%r10 |
| adcq 8+0(%r8),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 16(%r8),%r8 |
| cmpq $-6,%rcx |
| jg .Lopen_sse_main_loop_rounds |
| paddd .Lchacha20_consts(%rip),%xmm3 |
| paddd 0+48(%rbp),%xmm7 |
| paddd 0+64(%rbp),%xmm11 |
| paddd 0+144(%rbp),%xmm15 |
| paddd .Lchacha20_consts(%rip),%xmm2 |
| paddd 0+48(%rbp),%xmm6 |
| paddd 0+64(%rbp),%xmm10 |
| paddd 0+128(%rbp),%xmm14 |
| paddd .Lchacha20_consts(%rip),%xmm1 |
| paddd 0+48(%rbp),%xmm5 |
| paddd 0+64(%rbp),%xmm9 |
| paddd 0+112(%rbp),%xmm13 |
| paddd .Lchacha20_consts(%rip),%xmm0 |
| paddd 0+48(%rbp),%xmm4 |
| paddd 0+64(%rbp),%xmm8 |
| paddd 0+96(%rbp),%xmm12 |
| movdqa %xmm12,0+80(%rbp) |
| movdqu 0 + 0(%rsi),%xmm12 |
| pxor %xmm3,%xmm12 |
| movdqu %xmm12,0 + 0(%rdi) |
| movdqu 16 + 0(%rsi),%xmm12 |
| pxor %xmm7,%xmm12 |
| movdqu %xmm12,16 + 0(%rdi) |
| movdqu 32 + 0(%rsi),%xmm12 |
| pxor %xmm11,%xmm12 |
| movdqu %xmm12,32 + 0(%rdi) |
| movdqu 48 + 0(%rsi),%xmm12 |
| pxor %xmm15,%xmm12 |
| movdqu %xmm12,48 + 0(%rdi) |
| movdqu 0 + 64(%rsi),%xmm3 |
| movdqu 16 + 64(%rsi),%xmm7 |
| movdqu 32 + 64(%rsi),%xmm11 |
| movdqu 48 + 64(%rsi),%xmm15 |
| pxor %xmm3,%xmm2 |
| pxor %xmm7,%xmm6 |
| pxor %xmm11,%xmm10 |
| pxor %xmm14,%xmm15 |
| movdqu %xmm2,0 + 64(%rdi) |
| movdqu %xmm6,16 + 64(%rdi) |
| movdqu %xmm10,32 + 64(%rdi) |
| movdqu %xmm15,48 + 64(%rdi) |
| movdqu 0 + 128(%rsi),%xmm3 |
| movdqu 16 + 128(%rsi),%xmm7 |
| movdqu 32 + 128(%rsi),%xmm11 |
| movdqu 48 + 128(%rsi),%xmm15 |
| pxor %xmm3,%xmm1 |
| pxor %xmm7,%xmm5 |
| pxor %xmm11,%xmm9 |
| pxor %xmm13,%xmm15 |
| movdqu %xmm1,0 + 128(%rdi) |
| movdqu %xmm5,16 + 128(%rdi) |
| movdqu %xmm9,32 + 128(%rdi) |
| movdqu %xmm15,48 + 128(%rdi) |
| movdqu 0 + 192(%rsi),%xmm3 |
| movdqu 16 + 192(%rsi),%xmm7 |
| movdqu 32 + 192(%rsi),%xmm11 |
| movdqu 48 + 192(%rsi),%xmm15 |
| pxor %xmm3,%xmm0 |
| pxor %xmm7,%xmm4 |
| pxor %xmm11,%xmm8 |
| pxor 0+80(%rbp),%xmm15 |
| movdqu %xmm0,0 + 192(%rdi) |
| movdqu %xmm4,16 + 192(%rdi) |
| movdqu %xmm8,32 + 192(%rdi) |
| movdqu %xmm15,48 + 192(%rdi) |
| |
| leaq 256(%rsi),%rsi |
| leaq 256(%rdi),%rdi |
| subq $256,%rbx |
| jmp .Lopen_sse_main_loop |
| .Lopen_sse_tail: |
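| // Fewer than 256 bytes remain: choose a 1-4 block tail path based on the |
| // remaining length, still interleaving the Poly1305 hash of the ciphertext. |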
| |
| testq %rbx,%rbx |
| jz .Lopen_sse_finalize |
| cmpq $192,%rbx |
| ja .Lopen_sse_tail_256 |
| cmpq $128,%rbx |
| ja .Lopen_sse_tail_192 |
| cmpq $64,%rbx |
| ja .Lopen_sse_tail_128 |
| movdqa .Lchacha20_consts(%rip),%xmm0 |
| movdqa 0+48(%rbp),%xmm4 |
| movdqa 0+64(%rbp),%xmm8 |
| movdqa 0+96(%rbp),%xmm12 |
| paddd .Lsse_inc(%rip),%xmm12 |
| movdqa %xmm12,0+96(%rbp) |
| |
| xorq %r8,%r8 |
| movq %rbx,%rcx |
| cmpq $16,%rcx |
| jb .Lopen_sse_tail_64_rounds |
| .Lopen_sse_tail_64_rounds_and_x1hash: |
| addq 0+0(%rsi,%r8,1),%r10 |
| adcq 8+0(%rsi,%r8,1),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| subq $16,%rcx |
| .Lopen_sse_tail_64_rounds: |
| addq $16,%r8 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol16(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm4 |
| pxor %xmm3,%xmm4 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol8(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm4 |
| pxor %xmm3,%xmm4 |
| .byte 102,15,58,15,228,4 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,12 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol16(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm4 |
| pxor %xmm3,%xmm4 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol8(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm4 |
| pxor %xmm3,%xmm4 |
| .byte 102,15,58,15,228,12 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,4 |
| |
| cmpq $16,%rcx |
| jae .Lopen_sse_tail_64_rounds_and_x1hash |
| cmpq $160,%r8 |
| jne .Lopen_sse_tail_64_rounds |
| paddd .Lchacha20_consts(%rip),%xmm0 |
| paddd 0+48(%rbp),%xmm4 |
| paddd 0+64(%rbp),%xmm8 |
| paddd 0+96(%rbp),%xmm12 |
| |
| jmp .Lopen_sse_tail_64_dec_loop |
| |
| .Lopen_sse_tail_128: |
| movdqa .Lchacha20_consts(%rip),%xmm0 |
| movdqa 0+48(%rbp),%xmm4 |
| movdqa 0+64(%rbp),%xmm8 |
| movdqa %xmm0,%xmm1 |
| movdqa %xmm4,%xmm5 |
| movdqa %xmm8,%xmm9 |
| movdqa 0+96(%rbp),%xmm13 |
| paddd .Lsse_inc(%rip),%xmm13 |
| movdqa %xmm13,%xmm12 |
| paddd .Lsse_inc(%rip),%xmm12 |
| movdqa %xmm12,0+96(%rbp) |
| movdqa %xmm13,0+112(%rbp) |
| |
| movq %rbx,%rcx |
| andq $-16,%rcx |
| xorq %r8,%r8 |
| .Lopen_sse_tail_128_rounds_and_x1hash: |
| addq 0+0(%rsi,%r8,1),%r10 |
| adcq 8+0(%rsi,%r8,1),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| .Lopen_sse_tail_128_rounds: |
| addq $16,%r8 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol16(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm4 |
| pxor %xmm3,%xmm4 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol8(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm4 |
| pxor %xmm3,%xmm4 |
| .byte 102,15,58,15,228,4 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,12 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol16(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm5 |
| pxor %xmm3,%xmm5 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol8(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm5 |
| pxor %xmm3,%xmm5 |
| .byte 102,15,58,15,237,4 |
| .byte 102,69,15,58,15,201,8 |
| .byte 102,69,15,58,15,237,12 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol16(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm4 |
| pxor %xmm3,%xmm4 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol8(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm4 |
| pxor %xmm3,%xmm4 |
| .byte 102,15,58,15,228,12 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,4 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol16(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm5 |
| pxor %xmm3,%xmm5 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol8(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm5 |
| pxor %xmm3,%xmm5 |
| .byte 102,15,58,15,237,12 |
| .byte 102,69,15,58,15,201,8 |
| .byte 102,69,15,58,15,237,4 |
| |
| cmpq %rcx,%r8 |
| jb .Lopen_sse_tail_128_rounds_and_x1hash |
| cmpq $160,%r8 |
| jne .Lopen_sse_tail_128_rounds |
| paddd .Lchacha20_consts(%rip),%xmm1 |
| paddd 0+48(%rbp),%xmm5 |
| paddd 0+64(%rbp),%xmm9 |
| paddd 0+112(%rbp),%xmm13 |
| paddd .Lchacha20_consts(%rip),%xmm0 |
| paddd 0+48(%rbp),%xmm4 |
| paddd 0+64(%rbp),%xmm8 |
| paddd 0+96(%rbp),%xmm12 |
| movdqu 0 + 0(%rsi),%xmm3 |
| movdqu 16 + 0(%rsi),%xmm7 |
| movdqu 32 + 0(%rsi),%xmm11 |
| movdqu 48 + 0(%rsi),%xmm15 |
| pxor %xmm3,%xmm1 |
| pxor %xmm7,%xmm5 |
| pxor %xmm11,%xmm9 |
| pxor %xmm13,%xmm15 |
| movdqu %xmm1,0 + 0(%rdi) |
| movdqu %xmm5,16 + 0(%rdi) |
| movdqu %xmm9,32 + 0(%rdi) |
| movdqu %xmm15,48 + 0(%rdi) |
| |
| subq $64,%rbx |
| leaq 64(%rsi),%rsi |
| leaq 64(%rdi),%rdi |
| jmp .Lopen_sse_tail_64_dec_loop |
| |
| .Lopen_sse_tail_192: |
| movdqa .Lchacha20_consts(%rip),%xmm0 |
| movdqa 0+48(%rbp),%xmm4 |
| movdqa 0+64(%rbp),%xmm8 |
| movdqa %xmm0,%xmm1 |
| movdqa %xmm4,%xmm5 |
| movdqa %xmm8,%xmm9 |
| movdqa %xmm0,%xmm2 |
| movdqa %xmm4,%xmm6 |
| movdqa %xmm8,%xmm10 |
| movdqa 0+96(%rbp),%xmm14 |
| paddd .Lsse_inc(%rip),%xmm14 |
| movdqa %xmm14,%xmm13 |
| paddd .Lsse_inc(%rip),%xmm13 |
| movdqa %xmm13,%xmm12 |
| paddd .Lsse_inc(%rip),%xmm12 |
| movdqa %xmm12,0+96(%rbp) |
| movdqa %xmm13,0+112(%rbp) |
| movdqa %xmm14,0+128(%rbp) |
| |
| movq %rbx,%rcx |
| movq $160,%r8 |
| cmpq $160,%rcx |
| cmovgq %r8,%rcx |
| andq $-16,%rcx |
| xorq %r8,%r8 |
| .Lopen_sse_tail_192_rounds_and_x1hash: |
| addq 0+0(%rsi,%r8,1),%r10 |
| adcq 8+0(%rsi,%r8,1),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| .Lopen_sse_tail_192_rounds: |
| addq $16,%r8 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol16(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm4 |
| pxor %xmm3,%xmm4 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol8(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm4 |
| pxor %xmm3,%xmm4 |
| .byte 102,15,58,15,228,4 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,12 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol16(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm5 |
| pxor %xmm3,%xmm5 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol8(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm5 |
| pxor %xmm3,%xmm5 |
| .byte 102,15,58,15,237,4 |
| .byte 102,69,15,58,15,201,8 |
| .byte 102,69,15,58,15,237,12 |
| paddd %xmm6,%xmm2 |
| pxor %xmm2,%xmm14 |
| pshufb .Lrol16(%rip),%xmm14 |
| paddd %xmm14,%xmm10 |
| pxor %xmm10,%xmm6 |
| movdqa %xmm6,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm6 |
| pxor %xmm3,%xmm6 |
| paddd %xmm6,%xmm2 |
| pxor %xmm2,%xmm14 |
| pshufb .Lrol8(%rip),%xmm14 |
| paddd %xmm14,%xmm10 |
| pxor %xmm10,%xmm6 |
| movdqa %xmm6,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm6 |
| pxor %xmm3,%xmm6 |
| .byte 102,15,58,15,246,4 |
| .byte 102,69,15,58,15,210,8 |
| .byte 102,69,15,58,15,246,12 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol16(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm4 |
| pxor %xmm3,%xmm4 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol8(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm4 |
| pxor %xmm3,%xmm4 |
| .byte 102,15,58,15,228,12 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,4 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol16(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm5 |
| pxor %xmm3,%xmm5 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol8(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm5 |
| pxor %xmm3,%xmm5 |
| .byte 102,15,58,15,237,12 |
| .byte 102,69,15,58,15,201,8 |
| .byte 102,69,15,58,15,237,4 |
| paddd %xmm6,%xmm2 |
| pxor %xmm2,%xmm14 |
| pshufb .Lrol16(%rip),%xmm14 |
| paddd %xmm14,%xmm10 |
| pxor %xmm10,%xmm6 |
| movdqa %xmm6,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm6 |
| pxor %xmm3,%xmm6 |
| paddd %xmm6,%xmm2 |
| pxor %xmm2,%xmm14 |
| pshufb .Lrol8(%rip),%xmm14 |
| paddd %xmm14,%xmm10 |
| pxor %xmm10,%xmm6 |
| movdqa %xmm6,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm6 |
| pxor %xmm3,%xmm6 |
| .byte 102,15,58,15,246,12 |
| .byte 102,69,15,58,15,210,8 |
| .byte 102,69,15,58,15,246,4 |
| |
| cmpq %rcx,%r8 |
| jb .Lopen_sse_tail_192_rounds_and_x1hash |
| cmpq $160,%r8 |
| jne .Lopen_sse_tail_192_rounds |
| cmpq $176,%rbx |
| jb .Lopen_sse_tail_192_finish |
| addq 0+160(%rsi),%r10 |
| adcq 8+160(%rsi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| cmpq $192,%rbx |
| jb .Lopen_sse_tail_192_finish |
| addq 0+176(%rsi),%r10 |
| adcq 8+176(%rsi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| .Lopen_sse_tail_192_finish: |
| paddd .Lchacha20_consts(%rip),%xmm2 |
| paddd 0+48(%rbp),%xmm6 |
| paddd 0+64(%rbp),%xmm10 |
| paddd 0+128(%rbp),%xmm14 |
| paddd .Lchacha20_consts(%rip),%xmm1 |
| paddd 0+48(%rbp),%xmm5 |
| paddd 0+64(%rbp),%xmm9 |
| paddd 0+112(%rbp),%xmm13 |
| paddd .Lchacha20_consts(%rip),%xmm0 |
| paddd 0+48(%rbp),%xmm4 |
| paddd 0+64(%rbp),%xmm8 |
| paddd 0+96(%rbp),%xmm12 |
| movdqu 0 + 0(%rsi),%xmm3 |
| movdqu 16 + 0(%rsi),%xmm7 |
| movdqu 32 + 0(%rsi),%xmm11 |
| movdqu 48 + 0(%rsi),%xmm15 |
| pxor %xmm3,%xmm2 |
| pxor %xmm7,%xmm6 |
| pxor %xmm11,%xmm10 |
| pxor %xmm14,%xmm15 |
| movdqu %xmm2,0 + 0(%rdi) |
| movdqu %xmm6,16 + 0(%rdi) |
| movdqu %xmm10,32 + 0(%rdi) |
| movdqu %xmm15,48 + 0(%rdi) |
| movdqu 0 + 64(%rsi),%xmm3 |
| movdqu 16 + 64(%rsi),%xmm7 |
| movdqu 32 + 64(%rsi),%xmm11 |
| movdqu 48 + 64(%rsi),%xmm15 |
| pxor %xmm3,%xmm1 |
| pxor %xmm7,%xmm5 |
| pxor %xmm11,%xmm9 |
| pxor %xmm13,%xmm15 |
| movdqu %xmm1,0 + 64(%rdi) |
| movdqu %xmm5,16 + 64(%rdi) |
| movdqu %xmm9,32 + 64(%rdi) |
| movdqu %xmm15,48 + 64(%rdi) |
| |
| subq $128,%rbx |
| leaq 128(%rsi),%rsi |
| leaq 128(%rdi),%rdi |
| jmp .Lopen_sse_tail_64_dec_loop |
| |
| .Lopen_sse_tail_256: |
| movdqa .Lchacha20_consts(%rip),%xmm0 |
| movdqa 0+48(%rbp),%xmm4 |
| movdqa 0+64(%rbp),%xmm8 |
| movdqa %xmm0,%xmm1 |
| movdqa %xmm4,%xmm5 |
| movdqa %xmm8,%xmm9 |
| movdqa %xmm0,%xmm2 |
| movdqa %xmm4,%xmm6 |
| movdqa %xmm8,%xmm10 |
| movdqa %xmm0,%xmm3 |
| movdqa %xmm4,%xmm7 |
| movdqa %xmm8,%xmm11 |
| movdqa 0+96(%rbp),%xmm15 |
| paddd .Lsse_inc(%rip),%xmm15 |
| movdqa %xmm15,%xmm14 |
| paddd .Lsse_inc(%rip),%xmm14 |
| movdqa %xmm14,%xmm13 |
| paddd .Lsse_inc(%rip),%xmm13 |
| movdqa %xmm13,%xmm12 |
| paddd .Lsse_inc(%rip),%xmm12 |
| movdqa %xmm12,0+96(%rbp) |
| movdqa %xmm13,0+112(%rbp) |
| movdqa %xmm14,0+128(%rbp) |
| movdqa %xmm15,0+144(%rbp) |
| |
| xorq %r8,%r8 |
| .Lopen_sse_tail_256_rounds_and_x1hash: |
| addq 0+0(%rsi,%r8,1),%r10 |
| adcq 8+0(%rsi,%r8,1),%r11 |
| adcq $1,%r12 |
| movdqa %xmm11,0+80(%rbp) |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol16(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm11 |
| pslld $12,%xmm11 |
| psrld $20,%xmm4 |
| pxor %xmm11,%xmm4 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol8(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm11 |
| pslld $7,%xmm11 |
| psrld $25,%xmm4 |
| pxor %xmm11,%xmm4 |
| .byte 102,15,58,15,228,4 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,12 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol16(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm11 |
| pslld $12,%xmm11 |
| psrld $20,%xmm5 |
| pxor %xmm11,%xmm5 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol8(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm11 |
| pslld $7,%xmm11 |
| psrld $25,%xmm5 |
| pxor %xmm11,%xmm5 |
| .byte 102,15,58,15,237,4 |
| .byte 102,69,15,58,15,201,8 |
| .byte 102,69,15,58,15,237,12 |
| paddd %xmm6,%xmm2 |
| pxor %xmm2,%xmm14 |
| pshufb .Lrol16(%rip),%xmm14 |
| paddd %xmm14,%xmm10 |
| pxor %xmm10,%xmm6 |
| movdqa %xmm6,%xmm11 |
| pslld $12,%xmm11 |
| psrld $20,%xmm6 |
| pxor %xmm11,%xmm6 |
| paddd %xmm6,%xmm2 |
| pxor %xmm2,%xmm14 |
| pshufb .Lrol8(%rip),%xmm14 |
| paddd %xmm14,%xmm10 |
| pxor %xmm10,%xmm6 |
| movdqa %xmm6,%xmm11 |
| pslld $7,%xmm11 |
| psrld $25,%xmm6 |
| pxor %xmm11,%xmm6 |
| .byte 102,15,58,15,246,4 |
| .byte 102,69,15,58,15,210,8 |
| .byte 102,69,15,58,15,246,12 |
| movdqa 0+80(%rbp),%xmm11 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movdqa %xmm9,0+80(%rbp) |
| paddd %xmm7,%xmm3 |
| pxor %xmm3,%xmm15 |
| pshufb .Lrol16(%rip),%xmm15 |
| paddd %xmm15,%xmm11 |
| pxor %xmm11,%xmm7 |
| movdqa %xmm7,%xmm9 |
| pslld $12,%xmm9 |
| psrld $20,%xmm7 |
| pxor %xmm9,%xmm7 |
| paddd %xmm7,%xmm3 |
| pxor %xmm3,%xmm15 |
| pshufb .Lrol8(%rip),%xmm15 |
| paddd %xmm15,%xmm11 |
| pxor %xmm11,%xmm7 |
| movdqa %xmm7,%xmm9 |
| pslld $7,%xmm9 |
| psrld $25,%xmm7 |
| pxor %xmm9,%xmm7 |
| .byte 102,15,58,15,255,4 |
| .byte 102,69,15,58,15,219,8 |
| .byte 102,69,15,58,15,255,12 |
| movdqa 0+80(%rbp),%xmm9 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| movdqa %xmm11,0+80(%rbp) |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol16(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm11 |
| pslld $12,%xmm11 |
| psrld $20,%xmm4 |
| pxor %xmm11,%xmm4 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol8(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm11 |
| pslld $7,%xmm11 |
| psrld $25,%xmm4 |
| pxor %xmm11,%xmm4 |
| .byte 102,15,58,15,228,12 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,4 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol16(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm11 |
| pslld $12,%xmm11 |
| psrld $20,%xmm5 |
| pxor %xmm11,%xmm5 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol8(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm11 |
| pslld $7,%xmm11 |
| psrld $25,%xmm5 |
| pxor %xmm11,%xmm5 |
| .byte 102,15,58,15,237,12 |
| .byte 102,69,15,58,15,201,8 |
| .byte 102,69,15,58,15,237,4 |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| paddd %xmm6,%xmm2 |
| pxor %xmm2,%xmm14 |
| pshufb .Lrol16(%rip),%xmm14 |
| paddd %xmm14,%xmm10 |
| pxor %xmm10,%xmm6 |
| movdqa %xmm6,%xmm11 |
| pslld $12,%xmm11 |
| psrld $20,%xmm6 |
| pxor %xmm11,%xmm6 |
| paddd %xmm6,%xmm2 |
| pxor %xmm2,%xmm14 |
| pshufb .Lrol8(%rip),%xmm14 |
| paddd %xmm14,%xmm10 |
| pxor %xmm10,%xmm6 |
| movdqa %xmm6,%xmm11 |
| pslld $7,%xmm11 |
| psrld $25,%xmm6 |
| pxor %xmm11,%xmm6 |
| .byte 102,15,58,15,246,12 |
| .byte 102,69,15,58,15,210,8 |
| .byte 102,69,15,58,15,246,4 |
| movdqa 0+80(%rbp),%xmm11 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| movdqa %xmm9,0+80(%rbp) |
| paddd %xmm7,%xmm3 |
| pxor %xmm3,%xmm15 |
| pshufb .Lrol16(%rip),%xmm15 |
| paddd %xmm15,%xmm11 |
| pxor %xmm11,%xmm7 |
| movdqa %xmm7,%xmm9 |
| pslld $12,%xmm9 |
| psrld $20,%xmm7 |
| pxor %xmm9,%xmm7 |
| paddd %xmm7,%xmm3 |
| pxor %xmm3,%xmm15 |
| pshufb .Lrol8(%rip),%xmm15 |
| paddd %xmm15,%xmm11 |
| pxor %xmm11,%xmm7 |
| movdqa %xmm7,%xmm9 |
| pslld $7,%xmm9 |
| psrld $25,%xmm7 |
| pxor %xmm9,%xmm7 |
| .byte 102,15,58,15,255,12 |
| .byte 102,69,15,58,15,219,8 |
| .byte 102,69,15,58,15,255,4 |
| movdqa 0+80(%rbp),%xmm9 |
| |
| addq $16,%r8 |
| cmpq $160,%r8 |
| jb .Lopen_sse_tail_256_rounds_and_x1hash |
| |
| movq %rbx,%rcx |
| andq $-16,%rcx |
| .Lopen_sse_tail_256_hash: |
| addq 0+0(%rsi,%r8,1),%r10 |
| adcq 8+0(%rsi,%r8,1),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| addq $16,%r8 |
| cmpq %rcx,%r8 |
| jb .Lopen_sse_tail_256_hash |
| paddd .Lchacha20_consts(%rip),%xmm3 |
| paddd 0+48(%rbp),%xmm7 |
| paddd 0+64(%rbp),%xmm11 |
| paddd 0+144(%rbp),%xmm15 |
| paddd .Lchacha20_consts(%rip),%xmm2 |
| paddd 0+48(%rbp),%xmm6 |
| paddd 0+64(%rbp),%xmm10 |
| paddd 0+128(%rbp),%xmm14 |
| paddd .Lchacha20_consts(%rip),%xmm1 |
| paddd 0+48(%rbp),%xmm5 |
| paddd 0+64(%rbp),%xmm9 |
| paddd 0+112(%rbp),%xmm13 |
| paddd .Lchacha20_consts(%rip),%xmm0 |
| paddd 0+48(%rbp),%xmm4 |
| paddd 0+64(%rbp),%xmm8 |
| paddd 0+96(%rbp),%xmm12 |
| movdqa %xmm12,0+80(%rbp) |
| movdqu 0 + 0(%rsi),%xmm12 |
| pxor %xmm3,%xmm12 |
| movdqu %xmm12,0 + 0(%rdi) |
| movdqu 16 + 0(%rsi),%xmm12 |
| pxor %xmm7,%xmm12 |
| movdqu %xmm12,16 + 0(%rdi) |
| movdqu 32 + 0(%rsi),%xmm12 |
| pxor %xmm11,%xmm12 |
| movdqu %xmm12,32 + 0(%rdi) |
| movdqu 48 + 0(%rsi),%xmm12 |
| pxor %xmm15,%xmm12 |
| movdqu %xmm12,48 + 0(%rdi) |
| movdqu 0 + 64(%rsi),%xmm3 |
| movdqu 16 + 64(%rsi),%xmm7 |
| movdqu 32 + 64(%rsi),%xmm11 |
| movdqu 48 + 64(%rsi),%xmm15 |
| pxor %xmm3,%xmm2 |
| pxor %xmm7,%xmm6 |
| pxor %xmm11,%xmm10 |
| pxor %xmm14,%xmm15 |
| movdqu %xmm2,0 + 64(%rdi) |
| movdqu %xmm6,16 + 64(%rdi) |
| movdqu %xmm10,32 + 64(%rdi) |
| movdqu %xmm15,48 + 64(%rdi) |
| movdqu 0 + 128(%rsi),%xmm3 |
| movdqu 16 + 128(%rsi),%xmm7 |
| movdqu 32 + 128(%rsi),%xmm11 |
| movdqu 48 + 128(%rsi),%xmm15 |
| pxor %xmm3,%xmm1 |
| pxor %xmm7,%xmm5 |
| pxor %xmm11,%xmm9 |
| pxor %xmm13,%xmm15 |
| movdqu %xmm1,0 + 128(%rdi) |
| movdqu %xmm5,16 + 128(%rdi) |
| movdqu %xmm9,32 + 128(%rdi) |
| movdqu %xmm15,48 + 128(%rdi) |
| |
| movdqa 0+80(%rbp),%xmm12 |
| subq $192,%rbx |
| leaq 192(%rsi),%rsi |
| leaq 192(%rdi),%rdi |
| |
| |
| .Lopen_sse_tail_64_dec_loop: |
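| // Decrypt the remaining data 16 bytes at a time with the leftover keystream, |
| // rotating the next keystream word into %xmm0 after each store. |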
| cmpq $16,%rbx |
| jb .Lopen_sse_tail_16_init |
| subq $16,%rbx |
| movdqu (%rsi),%xmm3 |
| pxor %xmm3,%xmm0 |
| movdqu %xmm0,(%rdi) |
| leaq 16(%rsi),%rsi |
| leaq 16(%rdi),%rdi |
| movdqa %xmm4,%xmm0 |
| movdqa %xmm8,%xmm4 |
| movdqa %xmm12,%xmm8 |
| jmp .Lopen_sse_tail_64_dec_loop |
| .Lopen_sse_tail_16_init: |
| movdqa %xmm0,%xmm1 |
| |
| |
| .Lopen_sse_tail_16: |
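| // Final partial block (fewer than 16 bytes): gather the ciphertext bytes into |
| // %xmm3, decrypt and store them one byte at a time, and hash the zero-padded |
| // 16-byte block with Poly1305. |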
| testq %rbx,%rbx |
| jz .Lopen_sse_finalize |
| |
| |
| |
| pxor %xmm3,%xmm3 |
| leaq -1(%rsi,%rbx,1),%rsi |
| movq %rbx,%r8 |
| .Lopen_sse_tail_16_compose: |
| pslldq $1,%xmm3 |
| pinsrb $0,(%rsi),%xmm3 |
| subq $1,%rsi |
| subq $1,%r8 |
| jnz .Lopen_sse_tail_16_compose |
| |
| .byte 102,73,15,126,221 |
| pextrq $1,%xmm3,%r14 |
| |
| pxor %xmm1,%xmm3 |
| |
| |
| .Lopen_sse_tail_16_extract: |
| pextrb $0,%xmm3,(%rdi) |
| psrldq $1,%xmm3 |
| addq $1,%rdi |
| subq $1,%rbx |
| jne .Lopen_sse_tail_16_extract |
| |
| addq %r13,%r10 |
| adcq %r14,%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| |
| .Lopen_sse_finalize: |
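| // Finalize: absorb the block holding the AAD and ciphertext lengths, do the |
| // final reduction modulo 2^130-5, add the second ("s") half of the Poly1305 |
| // key, and write the 16-byte tag through the pointer saved at entry. |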
| addq 0+0+32(%rbp),%r10 |
| adcq 8+0+32(%rbp),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| |
| movq %r10,%r13 |
| movq %r11,%r14 |
| movq %r12,%r15 |
| subq $-5,%r10 |
| sbbq $-1,%r11 |
| sbbq $3,%r12 |
| cmovcq %r13,%r10 |
| cmovcq %r14,%r11 |
| cmovcq %r15,%r12 |
| |
| addq 0+0+16(%rbp),%r10 |
| adcq 8+0+16(%rbp),%r11 |
| |
| .cfi_remember_state |
| addq $288 + 0 + 32,%rsp |
| .cfi_adjust_cfa_offset -(288 + 32) |
| |
| popq %r9 |
| .cfi_adjust_cfa_offset -8 |
| .cfi_restore %r9 |
| movq %r10,(%r9) |
| movq %r11,8(%r9) |
| popq %r15 |
| .cfi_adjust_cfa_offset -8 |
| .cfi_restore %r15 |
| popq %r14 |
| .cfi_adjust_cfa_offset -8 |
| .cfi_restore %r14 |
| popq %r13 |
| .cfi_adjust_cfa_offset -8 |
| .cfi_restore %r13 |
| popq %r12 |
| .cfi_adjust_cfa_offset -8 |
| .cfi_restore %r12 |
| popq %rbx |
| .cfi_adjust_cfa_offset -8 |
| .cfi_restore %rbx |
| popq %rbp |
| .cfi_adjust_cfa_offset -8 |
| .cfi_restore %rbp |
| ret |
| |
| .Lopen_sse_128: |
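| // Short-input path: ciphertexts of at most 128 bytes use three ChaCha20 |
| // blocks computed up front; block 0 keys Poly1305 and blocks 1-2 supply the |
| // keystream. |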
| .cfi_restore_state |
| movdqu .Lchacha20_consts(%rip),%xmm0 |
| movdqa %xmm0,%xmm1 |
| movdqa %xmm0,%xmm2 |
| movdqu 0(%r9),%xmm4 |
| movdqa %xmm4,%xmm5 |
| movdqa %xmm4,%xmm6 |
| movdqu 16(%r9),%xmm8 |
| movdqa %xmm8,%xmm9 |
| movdqa %xmm8,%xmm10 |
| movdqu 32(%r9),%xmm12 |
| movdqa %xmm12,%xmm13 |
| paddd .Lsse_inc(%rip),%xmm13 |
| movdqa %xmm13,%xmm14 |
| paddd .Lsse_inc(%rip),%xmm14 |
| movdqa %xmm4,%xmm7 |
| movdqa %xmm8,%xmm11 |
| movdqa %xmm13,%xmm15 |
| movq $10,%r10 |
| |
| .Lopen_sse_128_rounds: |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol16(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm4 |
| pxor %xmm3,%xmm4 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol8(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm4 |
| pxor %xmm3,%xmm4 |
| .byte 102,15,58,15,228,4 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,12 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol16(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm5 |
| pxor %xmm3,%xmm5 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol8(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm5 |
| pxor %xmm3,%xmm5 |
| .byte 102,15,58,15,237,4 |
| .byte 102,69,15,58,15,201,8 |
| .byte 102,69,15,58,15,237,12 |
| paddd %xmm6,%xmm2 |
| pxor %xmm2,%xmm14 |
| pshufb .Lrol16(%rip),%xmm14 |
| paddd %xmm14,%xmm10 |
| pxor %xmm10,%xmm6 |
| movdqa %xmm6,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm6 |
| pxor %xmm3,%xmm6 |
| paddd %xmm6,%xmm2 |
| pxor %xmm2,%xmm14 |
| pshufb .Lrol8(%rip),%xmm14 |
| paddd %xmm14,%xmm10 |
| pxor %xmm10,%xmm6 |
| movdqa %xmm6,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm6 |
| pxor %xmm3,%xmm6 |
| .byte 102,15,58,15,246,4 |
| .byte 102,69,15,58,15,210,8 |
| .byte 102,69,15,58,15,246,12 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol16(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm4 |
| pxor %xmm3,%xmm4 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol8(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm4 |
| pxor %xmm3,%xmm4 |
| .byte 102,15,58,15,228,12 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,4 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol16(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm5 |
| pxor %xmm3,%xmm5 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol8(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm5 |
| pxor %xmm3,%xmm5 |
| .byte 102,15,58,15,237,12 |
| .byte 102,69,15,58,15,201,8 |
| .byte 102,69,15,58,15,237,4 |
| paddd %xmm6,%xmm2 |
| pxor %xmm2,%xmm14 |
| pshufb .Lrol16(%rip),%xmm14 |
| paddd %xmm14,%xmm10 |
| pxor %xmm10,%xmm6 |
| movdqa %xmm6,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm6 |
| pxor %xmm3,%xmm6 |
| paddd %xmm6,%xmm2 |
| pxor %xmm2,%xmm14 |
| pshufb .Lrol8(%rip),%xmm14 |
| paddd %xmm14,%xmm10 |
| pxor %xmm10,%xmm6 |
| movdqa %xmm6,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm6 |
| pxor %xmm3,%xmm6 |
| .byte 102,15,58,15,246,12 |
| .byte 102,69,15,58,15,210,8 |
| .byte 102,69,15,58,15,246,4 |
| |
| decq %r10 |
| jnz .Lopen_sse_128_rounds |
| paddd .Lchacha20_consts(%rip),%xmm0 |
| paddd .Lchacha20_consts(%rip),%xmm1 |
| paddd .Lchacha20_consts(%rip),%xmm2 |
| paddd %xmm7,%xmm4 |
| paddd %xmm7,%xmm5 |
| paddd %xmm7,%xmm6 |
| paddd %xmm11,%xmm9 |
| paddd %xmm11,%xmm10 |
| paddd %xmm15,%xmm13 |
| paddd .Lsse_inc(%rip),%xmm15 |
| paddd %xmm15,%xmm14 |
| |
| pand .Lclamp(%rip),%xmm0 |
| movdqa %xmm0,0+0(%rbp) |
| movdqa %xmm4,0+16(%rbp) |
| |
| movq %r8,%r8 |
| call poly_hash_ad_internal |
| .Lopen_sse_128_xor_hash: |
| cmpq $16,%rbx |
| jb .Lopen_sse_tail_16 |
| subq $16,%rbx |
| addq 0+0(%rsi),%r10 |
| adcq 8+0(%rsi),%r11 |
| adcq $1,%r12 |
| |
| |
| movdqu 0(%rsi),%xmm3 |
| pxor %xmm3,%xmm1 |
| movdqu %xmm1,0(%rdi) |
| leaq 16(%rsi),%rsi |
| leaq 16(%rdi),%rdi |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| |
| movdqa %xmm5,%xmm1 |
| movdqa %xmm9,%xmm5 |
| movdqa %xmm13,%xmm9 |
| movdqa %xmm2,%xmm13 |
| movdqa %xmm6,%xmm2 |
| movdqa %xmm10,%xmm6 |
| movdqa %xmm14,%xmm10 |
| jmp .Lopen_sse_128_xor_hash |
| .size chacha20_poly1305_open_nohw, .-chacha20_poly1305_open_nohw |
| .cfi_endproc |
| |
| |
| |
| |
| |
| |
| |
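| // chacha20_poly1305_seal_nohw: SSE seal (encrypt) path. Register arguments |
| // mirror the open path: %rdi = ciphertext out, %rsi = plaintext in, |
| // %rdx = plaintext length, %rcx = AAD, %r8 = AAD length, %r9 = key/nonce |
| // block. Poly1305 runs over the ciphertext as it is produced, and, as in the |
| // open path, %r9 is saved so the tag can be written back through it on return. |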
| .globl chacha20_poly1305_seal_nohw |
| .hidden chacha20_poly1305_seal_nohw |
| .type chacha20_poly1305_seal_nohw,@function |
| .align 64 |
| chacha20_poly1305_seal_nohw: |
| .cfi_startproc |
| _CET_ENDBR |
| pushq %rbp |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %rbp,-16 |
| pushq %rbx |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %rbx,-24 |
| pushq %r12 |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %r12,-32 |
| pushq %r13 |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %r13,-40 |
| pushq %r14 |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %r14,-48 |
| pushq %r15 |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %r15,-56 |
| |
| |
| pushq %r9 |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %r9,-64 |
| subq $288 + 0 + 32,%rsp |
| .cfi_adjust_cfa_offset 288 + 32 |
| leaq 32(%rsp),%rbp |
| andq $-32,%rbp |
| |
| movq 56(%r9),%rbx |
| addq %rdx,%rbx |
| movq %r8,0+0+32(%rbp) |
| movq %rbx,8+0+32(%rbp) |
| movq %rdx,%rbx |
| |
| cmpq $128,%rbx |
| jbe .Lseal_sse_128 |
| |
| movdqa .Lchacha20_consts(%rip),%xmm0 |
| movdqu 0(%r9),%xmm4 |
| movdqu 16(%r9),%xmm8 |
| movdqu 32(%r9),%xmm12 |
| |
| movdqa %xmm0,%xmm1 |
| movdqa %xmm0,%xmm2 |
| movdqa %xmm0,%xmm3 |
| movdqa %xmm4,%xmm5 |
| movdqa %xmm4,%xmm6 |
| movdqa %xmm4,%xmm7 |
| movdqa %xmm8,%xmm9 |
| movdqa %xmm8,%xmm10 |
| movdqa %xmm8,%xmm11 |
| movdqa %xmm12,%xmm15 |
| paddd .Lsse_inc(%rip),%xmm12 |
| movdqa %xmm12,%xmm14 |
| paddd .Lsse_inc(%rip),%xmm12 |
| movdqa %xmm12,%xmm13 |
| paddd .Lsse_inc(%rip),%xmm12 |
| |
| movdqa %xmm4,0+48(%rbp) |
| movdqa %xmm8,0+64(%rbp) |
| movdqa %xmm12,0+96(%rbp) |
| movdqa %xmm13,0+112(%rbp) |
| movdqa %xmm14,0+128(%rbp) |
| movdqa %xmm15,0+144(%rbp) |
| movq $10,%r10 |
| .Lseal_sse_init_rounds: |
| movdqa %xmm8,0+80(%rbp) |
| movdqa .Lrol16(%rip),%xmm8 |
| paddd %xmm7,%xmm3 |
| paddd %xmm6,%xmm2 |
| paddd %xmm5,%xmm1 |
| paddd %xmm4,%xmm0 |
| pxor %xmm3,%xmm15 |
| pxor %xmm2,%xmm14 |
| pxor %xmm1,%xmm13 |
| pxor %xmm0,%xmm12 |
| .byte 102,69,15,56,0,248 |
| .byte 102,69,15,56,0,240 |
| .byte 102,69,15,56,0,232 |
| .byte 102,69,15,56,0,224 |
| movdqa 0+80(%rbp),%xmm8 |
| paddd %xmm15,%xmm11 |
| paddd %xmm14,%xmm10 |
| paddd %xmm13,%xmm9 |
| paddd %xmm12,%xmm8 |
| pxor %xmm11,%xmm7 |
| pxor %xmm10,%xmm6 |
| pxor %xmm9,%xmm5 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm8,0+80(%rbp) |
| movdqa %xmm7,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm7 |
| pxor %xmm8,%xmm7 |
| movdqa %xmm6,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm6 |
| pxor %xmm8,%xmm6 |
| movdqa %xmm5,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm5 |
| pxor %xmm8,%xmm5 |
| movdqa %xmm4,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm4 |
| pxor %xmm8,%xmm4 |
| movdqa .Lrol8(%rip),%xmm8 |
| paddd %xmm7,%xmm3 |
| paddd %xmm6,%xmm2 |
| paddd %xmm5,%xmm1 |
| paddd %xmm4,%xmm0 |
| pxor %xmm3,%xmm15 |
| pxor %xmm2,%xmm14 |
| pxor %xmm1,%xmm13 |
| pxor %xmm0,%xmm12 |
| .byte 102,69,15,56,0,248 |
| .byte 102,69,15,56,0,240 |
| .byte 102,69,15,56,0,232 |
| .byte 102,69,15,56,0,224 |
| movdqa 0+80(%rbp),%xmm8 |
| paddd %xmm15,%xmm11 |
| paddd %xmm14,%xmm10 |
| paddd %xmm13,%xmm9 |
| paddd %xmm12,%xmm8 |
| pxor %xmm11,%xmm7 |
| pxor %xmm10,%xmm6 |
| pxor %xmm9,%xmm5 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm8,0+80(%rbp) |
| movdqa %xmm7,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm7 |
| pxor %xmm8,%xmm7 |
| movdqa %xmm6,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm6 |
| pxor %xmm8,%xmm6 |
| movdqa %xmm5,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm5 |
| pxor %xmm8,%xmm5 |
| movdqa %xmm4,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm4 |
| pxor %xmm8,%xmm4 |
| movdqa 0+80(%rbp),%xmm8 |
| .byte 102,15,58,15,255,4 |
| .byte 102,69,15,58,15,219,8 |
| .byte 102,69,15,58,15,255,12 |
| .byte 102,15,58,15,246,4 |
| .byte 102,69,15,58,15,210,8 |
| .byte 102,69,15,58,15,246,12 |
| .byte 102,15,58,15,237,4 |
| .byte 102,69,15,58,15,201,8 |
| .byte 102,69,15,58,15,237,12 |
| .byte 102,15,58,15,228,4 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,12 |
| movdqa %xmm8,0+80(%rbp) |
| movdqa .Lrol16(%rip),%xmm8 |
| paddd %xmm7,%xmm3 |
| paddd %xmm6,%xmm2 |
| paddd %xmm5,%xmm1 |
| paddd %xmm4,%xmm0 |
| pxor %xmm3,%xmm15 |
| pxor %xmm2,%xmm14 |
| pxor %xmm1,%xmm13 |
| pxor %xmm0,%xmm12 |
| .byte 102,69,15,56,0,248 |
| .byte 102,69,15,56,0,240 |
| .byte 102,69,15,56,0,232 |
| .byte 102,69,15,56,0,224 |
| movdqa 0+80(%rbp),%xmm8 |
| paddd %xmm15,%xmm11 |
| paddd %xmm14,%xmm10 |
| paddd %xmm13,%xmm9 |
| paddd %xmm12,%xmm8 |
| pxor %xmm11,%xmm7 |
| pxor %xmm10,%xmm6 |
| pxor %xmm9,%xmm5 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm8,0+80(%rbp) |
| movdqa %xmm7,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm7 |
| pxor %xmm8,%xmm7 |
| movdqa %xmm6,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm6 |
| pxor %xmm8,%xmm6 |
| movdqa %xmm5,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm5 |
| pxor %xmm8,%xmm5 |
| movdqa %xmm4,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm4 |
| pxor %xmm8,%xmm4 |
| movdqa .Lrol8(%rip),%xmm8 |
| paddd %xmm7,%xmm3 |
| paddd %xmm6,%xmm2 |
| paddd %xmm5,%xmm1 |
| paddd %xmm4,%xmm0 |
| pxor %xmm3,%xmm15 |
| pxor %xmm2,%xmm14 |
| pxor %xmm1,%xmm13 |
| pxor %xmm0,%xmm12 |
| .byte 102,69,15,56,0,248 |
| .byte 102,69,15,56,0,240 |
| .byte 102,69,15,56,0,232 |
| .byte 102,69,15,56,0,224 |
| movdqa 0+80(%rbp),%xmm8 |
| paddd %xmm15,%xmm11 |
| paddd %xmm14,%xmm10 |
| paddd %xmm13,%xmm9 |
| paddd %xmm12,%xmm8 |
| pxor %xmm11,%xmm7 |
| pxor %xmm10,%xmm6 |
| pxor %xmm9,%xmm5 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm8,0+80(%rbp) |
| movdqa %xmm7,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm7 |
| pxor %xmm8,%xmm7 |
| movdqa %xmm6,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm6 |
| pxor %xmm8,%xmm6 |
| movdqa %xmm5,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm5 |
| pxor %xmm8,%xmm5 |
| movdqa %xmm4,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm4 |
| pxor %xmm8,%xmm4 |
| movdqa 0+80(%rbp),%xmm8 |
| .byte 102,15,58,15,255,12 |
| .byte 102,69,15,58,15,219,8 |
| .byte 102,69,15,58,15,255,4 |
| .byte 102,15,58,15,246,12 |
| .byte 102,69,15,58,15,210,8 |
| .byte 102,69,15,58,15,246,4 |
| .byte 102,15,58,15,237,12 |
| .byte 102,69,15,58,15,201,8 |
| .byte 102,69,15,58,15,237,4 |
| .byte 102,15,58,15,228,12 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,4 |
| |
| decq %r10 |
| jnz .Lseal_sse_init_rounds |
| paddd .Lchacha20_consts(%rip),%xmm3 |
| paddd 0+48(%rbp),%xmm7 |
| paddd 0+64(%rbp),%xmm11 |
| paddd 0+144(%rbp),%xmm15 |
| paddd .Lchacha20_consts(%rip),%xmm2 |
| paddd 0+48(%rbp),%xmm6 |
| paddd 0+64(%rbp),%xmm10 |
| paddd 0+128(%rbp),%xmm14 |
| paddd .Lchacha20_consts(%rip),%xmm1 |
| paddd 0+48(%rbp),%xmm5 |
| paddd 0+64(%rbp),%xmm9 |
| paddd 0+112(%rbp),%xmm13 |
| paddd .Lchacha20_consts(%rip),%xmm0 |
| paddd 0+48(%rbp),%xmm4 |
| paddd 0+64(%rbp),%xmm8 |
| paddd 0+96(%rbp),%xmm12 |
| |
| |
| pand .Lclamp(%rip),%xmm3 |
| movdqa %xmm3,0+0(%rbp) |
| movdqa %xmm7,0+16(%rbp) |
| |
| movq %r8,%r8 |
| call poly_hash_ad_internal |
| movdqu 0 + 0(%rsi),%xmm3 |
| movdqu 16 + 0(%rsi),%xmm7 |
| movdqu 32 + 0(%rsi),%xmm11 |
| movdqu 48 + 0(%rsi),%xmm15 |
| pxor %xmm3,%xmm2 |
| pxor %xmm7,%xmm6 |
| pxor %xmm11,%xmm10 |
| pxor %xmm14,%xmm15 |
| movdqu %xmm2,0 + 0(%rdi) |
| movdqu %xmm6,16 + 0(%rdi) |
| movdqu %xmm10,32 + 0(%rdi) |
| movdqu %xmm15,48 + 0(%rdi) |
| movdqu 0 + 64(%rsi),%xmm3 |
| movdqu 16 + 64(%rsi),%xmm7 |
| movdqu 32 + 64(%rsi),%xmm11 |
| movdqu 48 + 64(%rsi),%xmm15 |
| pxor %xmm3,%xmm1 |
| pxor %xmm7,%xmm5 |
| pxor %xmm11,%xmm9 |
| pxor %xmm13,%xmm15 |
| movdqu %xmm1,0 + 64(%rdi) |
| movdqu %xmm5,16 + 64(%rdi) |
| movdqu %xmm9,32 + 64(%rdi) |
| movdqu %xmm15,48 + 64(%rdi) |
| |
| cmpq $192,%rbx |
| ja .Lseal_sse_main_init |
| movq $128,%rcx |
| subq $128,%rbx |
| leaq 128(%rsi),%rsi |
| jmp .Lseal_sse_128_tail_hash |
| .Lseal_sse_main_init: |
| movdqu 0 + 128(%rsi),%xmm3 |
| movdqu 16 + 128(%rsi),%xmm7 |
| movdqu 32 + 128(%rsi),%xmm11 |
| movdqu 48 + 128(%rsi),%xmm15 |
| pxor %xmm3,%xmm0 |
| pxor %xmm7,%xmm4 |
| pxor %xmm11,%xmm8 |
| pxor %xmm12,%xmm15 |
| movdqu %xmm0,0 + 128(%rdi) |
| movdqu %xmm4,16 + 128(%rdi) |
| movdqu %xmm8,32 + 128(%rdi) |
| movdqu %xmm15,48 + 128(%rdi) |
| |
| movq $192,%rcx |
| subq $192,%rbx |
| leaq 192(%rsi),%rsi |
| movq $2,%rcx |
| movq $8,%r8 |
| cmpq $64,%rbx |
| jbe .Lseal_sse_tail_64 |
| cmpq $128,%rbx |
| jbe .Lseal_sse_tail_128 |
| cmpq $192,%rbx |
| jbe .Lseal_sse_tail_192 |
| |
| .Lseal_sse_main_loop: |
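| // Main seal loop: generate four parallel ChaCha20 blocks per iteration while |
| // Poly1305 hashes previously written ciphertext from the output buffer. |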
| movdqa .Lchacha20_consts(%rip),%xmm0 |
| movdqa 0+48(%rbp),%xmm4 |
| movdqa 0+64(%rbp),%xmm8 |
| movdqa %xmm0,%xmm1 |
| movdqa %xmm4,%xmm5 |
| movdqa %xmm8,%xmm9 |
| movdqa %xmm0,%xmm2 |
| movdqa %xmm4,%xmm6 |
| movdqa %xmm8,%xmm10 |
| movdqa %xmm0,%xmm3 |
| movdqa %xmm4,%xmm7 |
| movdqa %xmm8,%xmm11 |
| movdqa 0+96(%rbp),%xmm15 |
| paddd .Lsse_inc(%rip),%xmm15 |
| movdqa %xmm15,%xmm14 |
| paddd .Lsse_inc(%rip),%xmm14 |
| movdqa %xmm14,%xmm13 |
| paddd .Lsse_inc(%rip),%xmm13 |
| movdqa %xmm13,%xmm12 |
| paddd .Lsse_inc(%rip),%xmm12 |
| movdqa %xmm12,0+96(%rbp) |
| movdqa %xmm13,0+112(%rbp) |
| movdqa %xmm14,0+128(%rbp) |
| movdqa %xmm15,0+144(%rbp) |
| |
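| // Each iteration of the rounds loop below is one ChaCha20 double round (column |
| // round, palignr shuffles, diagonal round) interleaved with one 16-byte |
| // Poly1305 multiply-and-reduce over previously produced ciphertext. |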
| .align 32 |
| .Lseal_sse_main_rounds: |
| movdqa %xmm8,0+80(%rbp) |
| movdqa .Lrol16(%rip),%xmm8 |
| paddd %xmm7,%xmm3 |
| paddd %xmm6,%xmm2 |
| paddd %xmm5,%xmm1 |
| paddd %xmm4,%xmm0 |
| pxor %xmm3,%xmm15 |
| pxor %xmm2,%xmm14 |
| pxor %xmm1,%xmm13 |
| pxor %xmm0,%xmm12 |
| .byte 102,69,15,56,0,248 |
| .byte 102,69,15,56,0,240 |
| .byte 102,69,15,56,0,232 |
| .byte 102,69,15,56,0,224 |
| movdqa 0+80(%rbp),%xmm8 |
| paddd %xmm15,%xmm11 |
| paddd %xmm14,%xmm10 |
| paddd %xmm13,%xmm9 |
| paddd %xmm12,%xmm8 |
| pxor %xmm11,%xmm7 |
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| pxor %xmm10,%xmm6 |
| pxor %xmm9,%xmm5 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm8,0+80(%rbp) |
| movdqa %xmm7,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm7 |
| pxor %xmm8,%xmm7 |
| movdqa %xmm6,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm6 |
| pxor %xmm8,%xmm6 |
| movdqa %xmm5,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm5 |
| pxor %xmm8,%xmm5 |
| movdqa %xmm4,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm4 |
| pxor %xmm8,%xmm4 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movdqa .Lrol8(%rip),%xmm8 |
| paddd %xmm7,%xmm3 |
| paddd %xmm6,%xmm2 |
| paddd %xmm5,%xmm1 |
| paddd %xmm4,%xmm0 |
| pxor %xmm3,%xmm15 |
| pxor %xmm2,%xmm14 |
| pxor %xmm1,%xmm13 |
| pxor %xmm0,%xmm12 |
| .byte 102,69,15,56,0,248 |
| .byte 102,69,15,56,0,240 |
| .byte 102,69,15,56,0,232 |
| .byte 102,69,15,56,0,224 |
| movdqa 0+80(%rbp),%xmm8 |
| paddd %xmm15,%xmm11 |
| paddd %xmm14,%xmm10 |
| paddd %xmm13,%xmm9 |
| paddd %xmm12,%xmm8 |
| pxor %xmm11,%xmm7 |
| pxor %xmm10,%xmm6 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| pxor %xmm9,%xmm5 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm8,0+80(%rbp) |
| movdqa %xmm7,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm7 |
| pxor %xmm8,%xmm7 |
| movdqa %xmm6,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm6 |
| pxor %xmm8,%xmm6 |
| movdqa %xmm5,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm5 |
| pxor %xmm8,%xmm5 |
| movdqa %xmm4,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm4 |
| pxor %xmm8,%xmm4 |
| movdqa 0+80(%rbp),%xmm8 |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| .byte 102,15,58,15,255,4 |
| .byte 102,69,15,58,15,219,8 |
| .byte 102,69,15,58,15,255,12 |
| .byte 102,15,58,15,246,4 |
| .byte 102,69,15,58,15,210,8 |
| .byte 102,69,15,58,15,246,12 |
| .byte 102,15,58,15,237,4 |
| .byte 102,69,15,58,15,201,8 |
| .byte 102,69,15,58,15,237,12 |
| .byte 102,15,58,15,228,4 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,12 |
| movdqa %xmm8,0+80(%rbp) |
| movdqa .Lrol16(%rip),%xmm8 |
| paddd %xmm7,%xmm3 |
| paddd %xmm6,%xmm2 |
| paddd %xmm5,%xmm1 |
| paddd %xmm4,%xmm0 |
| pxor %xmm3,%xmm15 |
| pxor %xmm2,%xmm14 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| pxor %xmm1,%xmm13 |
| pxor %xmm0,%xmm12 |
| .byte 102,69,15,56,0,248 |
| .byte 102,69,15,56,0,240 |
| .byte 102,69,15,56,0,232 |
| .byte 102,69,15,56,0,224 |
| movdqa 0+80(%rbp),%xmm8 |
| paddd %xmm15,%xmm11 |
| paddd %xmm14,%xmm10 |
| paddd %xmm13,%xmm9 |
| paddd %xmm12,%xmm8 |
| pxor %xmm11,%xmm7 |
| pxor %xmm10,%xmm6 |
| pxor %xmm9,%xmm5 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm8,0+80(%rbp) |
| movdqa %xmm7,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm7 |
| pxor %xmm8,%xmm7 |
| movdqa %xmm6,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm6 |
| pxor %xmm8,%xmm6 |
| movdqa %xmm5,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm5 |
| pxor %xmm8,%xmm5 |
| movdqa %xmm4,%xmm8 |
| psrld $20,%xmm8 |
| pslld $32-20,%xmm4 |
| pxor %xmm8,%xmm4 |
| movdqa .Lrol8(%rip),%xmm8 |
| paddd %xmm7,%xmm3 |
| paddd %xmm6,%xmm2 |
| paddd %xmm5,%xmm1 |
| paddd %xmm4,%xmm0 |
| pxor %xmm3,%xmm15 |
| pxor %xmm2,%xmm14 |
| pxor %xmm1,%xmm13 |
| pxor %xmm0,%xmm12 |
| .byte 102,69,15,56,0,248 |
| .byte 102,69,15,56,0,240 |
| .byte 102,69,15,56,0,232 |
| .byte 102,69,15,56,0,224 |
| movdqa 0+80(%rbp),%xmm8 |
| paddd %xmm15,%xmm11 |
| paddd %xmm14,%xmm10 |
| paddd %xmm13,%xmm9 |
| paddd %xmm12,%xmm8 |
| pxor %xmm11,%xmm7 |
| pxor %xmm10,%xmm6 |
| pxor %xmm9,%xmm5 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm8,0+80(%rbp) |
| movdqa %xmm7,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm7 |
| pxor %xmm8,%xmm7 |
| movdqa %xmm6,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm6 |
| pxor %xmm8,%xmm6 |
| movdqa %xmm5,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm5 |
| pxor %xmm8,%xmm5 |
| movdqa %xmm4,%xmm8 |
| psrld $25,%xmm8 |
| pslld $32-25,%xmm4 |
| pxor %xmm8,%xmm4 |
| movdqa 0+80(%rbp),%xmm8 |
| .byte 102,15,58,15,255,12 |
| .byte 102,69,15,58,15,219,8 |
| .byte 102,69,15,58,15,255,4 |
| .byte 102,15,58,15,246,12 |
| .byte 102,69,15,58,15,210,8 |
| .byte 102,69,15,58,15,246,4 |
| .byte 102,15,58,15,237,12 |
| .byte 102,69,15,58,15,201,8 |
| .byte 102,69,15,58,15,237,4 |
| .byte 102,15,58,15,228,12 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,4 |
| |
| leaq 16(%rdi),%rdi |
| decq %r8 |
| jge .Lseal_sse_main_rounds |
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 16(%rdi),%rdi |
| decq %rcx |
| jg .Lseal_sse_main_rounds |
| paddd .Lchacha20_consts(%rip),%xmm3 |
| paddd 0+48(%rbp),%xmm7 |
| paddd 0+64(%rbp),%xmm11 |
| paddd 0+144(%rbp),%xmm15 |
| paddd .Lchacha20_consts(%rip),%xmm2 |
| paddd 0+48(%rbp),%xmm6 |
| paddd 0+64(%rbp),%xmm10 |
| paddd 0+128(%rbp),%xmm14 |
| paddd .Lchacha20_consts(%rip),%xmm1 |
| paddd 0+48(%rbp),%xmm5 |
| paddd 0+64(%rbp),%xmm9 |
| paddd 0+112(%rbp),%xmm13 |
| paddd .Lchacha20_consts(%rip),%xmm0 |
| paddd 0+48(%rbp),%xmm4 |
| paddd 0+64(%rbp),%xmm8 |
| paddd 0+96(%rbp),%xmm12 |
| |
| movdqa %xmm14,0+80(%rbp) |
| movdqa %xmm14,0+80(%rbp) |
| movdqu 0 + 0(%rsi),%xmm14 |
| pxor %xmm3,%xmm14 |
| movdqu %xmm14,0 + 0(%rdi) |
| movdqu 16 + 0(%rsi),%xmm14 |
| pxor %xmm7,%xmm14 |
| movdqu %xmm14,16 + 0(%rdi) |
| movdqu 32 + 0(%rsi),%xmm14 |
| pxor %xmm11,%xmm14 |
| movdqu %xmm14,32 + 0(%rdi) |
| movdqu 48 + 0(%rsi),%xmm14 |
| pxor %xmm15,%xmm14 |
| movdqu %xmm14,48 + 0(%rdi) |
| |
| movdqa 0+80(%rbp),%xmm14 |
| movdqu 0 + 64(%rsi),%xmm3 |
| movdqu 16 + 64(%rsi),%xmm7 |
| movdqu 32 + 64(%rsi),%xmm11 |
| movdqu 48 + 64(%rsi),%xmm15 |
| pxor %xmm3,%xmm2 |
| pxor %xmm7,%xmm6 |
| pxor %xmm11,%xmm10 |
| pxor %xmm14,%xmm15 |
| movdqu %xmm2,0 + 64(%rdi) |
| movdqu %xmm6,16 + 64(%rdi) |
| movdqu %xmm10,32 + 64(%rdi) |
| movdqu %xmm15,48 + 64(%rdi) |
| movdqu 0 + 128(%rsi),%xmm3 |
| movdqu 16 + 128(%rsi),%xmm7 |
| movdqu 32 + 128(%rsi),%xmm11 |
| movdqu 48 + 128(%rsi),%xmm15 |
| pxor %xmm3,%xmm1 |
| pxor %xmm7,%xmm5 |
| pxor %xmm11,%xmm9 |
| pxor %xmm13,%xmm15 |
| movdqu %xmm1,0 + 128(%rdi) |
| movdqu %xmm5,16 + 128(%rdi) |
| movdqu %xmm9,32 + 128(%rdi) |
| movdqu %xmm15,48 + 128(%rdi) |
| |
| cmpq $256,%rbx |
| ja .Lseal_sse_main_loop_xor |
| |
| movq $192,%rcx |
| subq $192,%rbx |
| leaq 192(%rsi),%rsi |
| jmp .Lseal_sse_128_tail_hash |
| .Lseal_sse_main_loop_xor: |
| movdqu 0 + 192(%rsi),%xmm3 |
| movdqu 16 + 192(%rsi),%xmm7 |
| movdqu 32 + 192(%rsi),%xmm11 |
| movdqu 48 + 192(%rsi),%xmm15 |
| pxor %xmm3,%xmm0 |
| pxor %xmm7,%xmm4 |
| pxor %xmm11,%xmm8 |
| pxor %xmm12,%xmm15 |
| movdqu %xmm0,0 + 192(%rdi) |
| movdqu %xmm4,16 + 192(%rdi) |
| movdqu %xmm8,32 + 192(%rdi) |
| movdqu %xmm15,48 + 192(%rdi) |
| |
| leaq 256(%rsi),%rsi |
| subq $256,%rbx |
| movq $6,%rcx |
| movq $4,%r8 |
| cmpq $192,%rbx |
| jg .Lseal_sse_main_loop |
| movq %rbx,%rcx |
| testq %rbx,%rbx |
| je .Lseal_sse_128_tail_hash |
| movq $6,%rcx |
| cmpq $128,%rbx |
| ja .Lseal_sse_tail_192 |
| cmpq $64,%rbx |
| ja .Lseal_sse_tail_128 |
| |
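| // At most 64 bytes of plaintext remain: run one final ChaCha20 block, using the |
| // extra hash iterations counted in %rcx to keep absorbing ciphertext left over |
| // from the previous chunk. |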
| .Lseal_sse_tail_64: |
| movdqa .Lchacha20_consts(%rip),%xmm0 |
| movdqa 0+48(%rbp),%xmm4 |
| movdqa 0+64(%rbp),%xmm8 |
| movdqa 0+96(%rbp),%xmm12 |
| paddd .Lsse_inc(%rip),%xmm12 |
| movdqa %xmm12,0+96(%rbp) |
| |
| .Lseal_sse_tail_64_rounds_and_x2hash: |
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 16(%rdi),%rdi |
| .Lseal_sse_tail_64_rounds_and_x1hash: |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol16(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm4 |
| pxor %xmm3,%xmm4 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol8(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm4 |
| pxor %xmm3,%xmm4 |
| .byte 102,15,58,15,228,4 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,12 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol16(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm4 |
| pxor %xmm3,%xmm4 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol8(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm4 |
| pxor %xmm3,%xmm4 |
| .byte 102,15,58,15,228,12 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,4 |
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 16(%rdi),%rdi |
| decq %rcx |
| jg .Lseal_sse_tail_64_rounds_and_x2hash |
| decq %r8 |
| jge .Lseal_sse_tail_64_rounds_and_x1hash |
| paddd .Lchacha20_consts(%rip),%xmm0 |
| paddd 0+48(%rbp),%xmm4 |
| paddd 0+64(%rbp),%xmm8 |
| paddd 0+96(%rbp),%xmm12 |
| |
| jmp .Lseal_sse_128_tail_xor |
| |
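| // Between 65 and 128 remaining bytes: two parallel ChaCha20 blocks, same |
| // round/hash interleaving as above. |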
| .Lseal_sse_tail_128: |
| movdqa .Lchacha20_consts(%rip),%xmm0 |
| movdqa 0+48(%rbp),%xmm4 |
| movdqa 0+64(%rbp),%xmm8 |
| movdqa %xmm0,%xmm1 |
| movdqa %xmm4,%xmm5 |
| movdqa %xmm8,%xmm9 |
| movdqa 0+96(%rbp),%xmm13 |
| paddd .Lsse_inc(%rip),%xmm13 |
| movdqa %xmm13,%xmm12 |
| paddd .Lsse_inc(%rip),%xmm12 |
| movdqa %xmm12,0+96(%rbp) |
| movdqa %xmm13,0+112(%rbp) |
| |
| .Lseal_sse_tail_128_rounds_and_x2hash: |
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 16(%rdi),%rdi |
| .Lseal_sse_tail_128_rounds_and_x1hash: |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol16(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm4 |
| pxor %xmm3,%xmm4 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol8(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm4 |
| pxor %xmm3,%xmm4 |
| .byte 102,15,58,15,228,4 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,12 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol16(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm5 |
| pxor %xmm3,%xmm5 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol8(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm5 |
| pxor %xmm3,%xmm5 |
| .byte 102,15,58,15,237,4 |
| .byte 102,69,15,58,15,201,8 |
| .byte 102,69,15,58,15,237,12 |
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol16(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm4 |
| pxor %xmm3,%xmm4 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol8(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm4 |
| pxor %xmm3,%xmm4 |
| .byte 102,15,58,15,228,12 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,4 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol16(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm5 |
| pxor %xmm3,%xmm5 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol8(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm5 |
| pxor %xmm3,%xmm5 |
| .byte 102,15,58,15,237,12 |
| .byte 102,69,15,58,15,201,8 |
| .byte 102,69,15,58,15,237,4 |
| |
| leaq 16(%rdi),%rdi |
| decq %rcx |
| jg .Lseal_sse_tail_128_rounds_and_x2hash |
| decq %r8 |
| jge .Lseal_sse_tail_128_rounds_and_x1hash |
| paddd .Lchacha20_consts(%rip),%xmm1 |
| paddd 0+48(%rbp),%xmm5 |
| paddd 0+64(%rbp),%xmm9 |
| paddd 0+112(%rbp),%xmm13 |
| paddd .Lchacha20_consts(%rip),%xmm0 |
| paddd 0+48(%rbp),%xmm4 |
| paddd 0+64(%rbp),%xmm8 |
| paddd 0+96(%rbp),%xmm12 |
| movdqu 0 + 0(%rsi),%xmm3 |
| movdqu 16 + 0(%rsi),%xmm7 |
| movdqu 32 + 0(%rsi),%xmm11 |
| movdqu 48 + 0(%rsi),%xmm15 |
| pxor %xmm3,%xmm1 |
| pxor %xmm7,%xmm5 |
| pxor %xmm11,%xmm9 |
| pxor %xmm13,%xmm15 |
| movdqu %xmm1,0 + 0(%rdi) |
| movdqu %xmm5,16 + 0(%rdi) |
| movdqu %xmm9,32 + 0(%rdi) |
| movdqu %xmm15,48 + 0(%rdi) |
| |
| movq $64,%rcx |
| subq $64,%rbx |
| leaq 64(%rsi),%rsi |
| jmp .Lseal_sse_128_tail_hash |
| |
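| // Between 129 and 192 remaining bytes: three parallel ChaCha20 blocks. |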
| .Lseal_sse_tail_192: |
| movdqa .Lchacha20_consts(%rip),%xmm0 |
| movdqa 0+48(%rbp),%xmm4 |
| movdqa 0+64(%rbp),%xmm8 |
| movdqa %xmm0,%xmm1 |
| movdqa %xmm4,%xmm5 |
| movdqa %xmm8,%xmm9 |
| movdqa %xmm0,%xmm2 |
| movdqa %xmm4,%xmm6 |
| movdqa %xmm8,%xmm10 |
| movdqa 0+96(%rbp),%xmm14 |
| paddd .Lsse_inc(%rip),%xmm14 |
| movdqa %xmm14,%xmm13 |
| paddd .Lsse_inc(%rip),%xmm13 |
| movdqa %xmm13,%xmm12 |
| paddd .Lsse_inc(%rip),%xmm12 |
| movdqa %xmm12,0+96(%rbp) |
| movdqa %xmm13,0+112(%rbp) |
| movdqa %xmm14,0+128(%rbp) |
| |
| .Lseal_sse_tail_192_rounds_and_x2hash: |
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 16(%rdi),%rdi |
| .Lseal_sse_tail_192_rounds_and_x1hash: |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol16(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm4 |
| pxor %xmm3,%xmm4 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol8(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm4 |
| pxor %xmm3,%xmm4 |
| .byte 102,15,58,15,228,4 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,12 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol16(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm5 |
| pxor %xmm3,%xmm5 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol8(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm5 |
| pxor %xmm3,%xmm5 |
| .byte 102,15,58,15,237,4 |
| .byte 102,69,15,58,15,201,8 |
| .byte 102,69,15,58,15,237,12 |
| paddd %xmm6,%xmm2 |
| pxor %xmm2,%xmm14 |
| pshufb .Lrol16(%rip),%xmm14 |
| paddd %xmm14,%xmm10 |
| pxor %xmm10,%xmm6 |
| movdqa %xmm6,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm6 |
| pxor %xmm3,%xmm6 |
| paddd %xmm6,%xmm2 |
| pxor %xmm2,%xmm14 |
| pshufb .Lrol8(%rip),%xmm14 |
| paddd %xmm14,%xmm10 |
| pxor %xmm10,%xmm6 |
| movdqa %xmm6,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm6 |
| pxor %xmm3,%xmm6 |
| .byte 102,15,58,15,246,4 |
| .byte 102,69,15,58,15,210,8 |
| .byte 102,69,15,58,15,246,12 |
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol16(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm4 |
| pxor %xmm3,%xmm4 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol8(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm4 |
| pxor %xmm3,%xmm4 |
| .byte 102,15,58,15,228,12 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,4 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol16(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm5 |
| pxor %xmm3,%xmm5 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol8(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm5 |
| pxor %xmm3,%xmm5 |
| .byte 102,15,58,15,237,12 |
| .byte 102,69,15,58,15,201,8 |
| .byte 102,69,15,58,15,237,4 |
| paddd %xmm6,%xmm2 |
| pxor %xmm2,%xmm14 |
| pshufb .Lrol16(%rip),%xmm14 |
| paddd %xmm14,%xmm10 |
| pxor %xmm10,%xmm6 |
| movdqa %xmm6,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm6 |
| pxor %xmm3,%xmm6 |
| paddd %xmm6,%xmm2 |
| pxor %xmm2,%xmm14 |
| pshufb .Lrol8(%rip),%xmm14 |
| paddd %xmm14,%xmm10 |
| pxor %xmm10,%xmm6 |
| movdqa %xmm6,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm6 |
| pxor %xmm3,%xmm6 |
| .byte 102,15,58,15,246,12 |
| .byte 102,69,15,58,15,210,8 |
| .byte 102,69,15,58,15,246,4 |
| |
| leaq 16(%rdi),%rdi |
| decq %rcx |
| jg .Lseal_sse_tail_192_rounds_and_x2hash |
| decq %r8 |
| jge .Lseal_sse_tail_192_rounds_and_x1hash |
| paddd .Lchacha20_consts(%rip),%xmm2 |
| paddd 0+48(%rbp),%xmm6 |
| paddd 0+64(%rbp),%xmm10 |
| paddd 0+128(%rbp),%xmm14 |
| paddd .Lchacha20_consts(%rip),%xmm1 |
| paddd 0+48(%rbp),%xmm5 |
| paddd 0+64(%rbp),%xmm9 |
| paddd 0+112(%rbp),%xmm13 |
| paddd .Lchacha20_consts(%rip),%xmm0 |
| paddd 0+48(%rbp),%xmm4 |
| paddd 0+64(%rbp),%xmm8 |
| paddd 0+96(%rbp),%xmm12 |
| movdqu 0 + 0(%rsi),%xmm3 |
| movdqu 16 + 0(%rsi),%xmm7 |
| movdqu 32 + 0(%rsi),%xmm11 |
| movdqu 48 + 0(%rsi),%xmm15 |
| pxor %xmm3,%xmm2 |
| pxor %xmm7,%xmm6 |
| pxor %xmm11,%xmm10 |
| pxor %xmm14,%xmm15 |
| movdqu %xmm2,0 + 0(%rdi) |
| movdqu %xmm6,16 + 0(%rdi) |
| movdqu %xmm10,32 + 0(%rdi) |
| movdqu %xmm15,48 + 0(%rdi) |
| movdqu 0 + 64(%rsi),%xmm3 |
| movdqu 16 + 64(%rsi),%xmm7 |
| movdqu 32 + 64(%rsi),%xmm11 |
| movdqu 48 + 64(%rsi),%xmm15 |
| pxor %xmm3,%xmm1 |
| pxor %xmm7,%xmm5 |
| pxor %xmm11,%xmm9 |
| pxor %xmm13,%xmm15 |
| movdqu %xmm1,0 + 64(%rdi) |
| movdqu %xmm5,16 + 64(%rdi) |
| movdqu %xmm9,32 + 64(%rdi) |
| movdqu %xmm15,48 + 64(%rdi) |
| |
| movq $128,%rcx |
| subq $128,%rbx |
| leaq 128(%rsi),%rsi |
| |
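| // Absorb any full 16-byte ciphertext blocks still counted in %rcx before the |
| // final partial-block XOR. |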
| .Lseal_sse_128_tail_hash: |
| cmpq $16,%rcx |
| jb .Lseal_sse_128_tail_xor |
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| subq $16,%rcx |
| leaq 16(%rdi),%rdi |
| jmp .Lseal_sse_128_tail_hash |
| |
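| // Encrypt and hash 16 bytes at a time, rotating the next keystream row into |
| // xmm0 after each block, until fewer than 16 bytes are left. |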
| .Lseal_sse_128_tail_xor: |
| cmpq $16,%rbx |
| jb .Lseal_sse_tail_16 |
| subq $16,%rbx |
| |
| movdqu 0(%rsi),%xmm3 |
| pxor %xmm3,%xmm0 |
| movdqu %xmm0,0(%rdi) |
| |
| addq 0(%rdi),%r10 |
| adcq 8(%rdi),%r11 |
| adcq $1,%r12 |
| leaq 16(%rsi),%rsi |
| leaq 16(%rdi),%rdi |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| |
| movdqa %xmm4,%xmm0 |
| movdqa %xmm8,%xmm4 |
| movdqa %xmm12,%xmm8 |
| movdqa %xmm1,%xmm12 |
| movdqa %xmm5,%xmm1 |
| movdqa %xmm9,%xmm5 |
| movdqa %xmm13,%xmm9 |
| jmp .Lseal_sse_128_tail_xor |
| |
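| // Final partial block (<16 bytes): gather it back-to-front into xmm15, XOR with |
| // the keystream in xmm0 and emit it byte by byte, then fill out the Poly1305 |
| // block from any extra input described by the structure saved at 288+32(%rsp). |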
| .Lseal_sse_tail_16: |
| testq %rbx,%rbx |
| jz .Lprocess_blocks_of_extra_in |
| |
| movq %rbx,%r8 |
| movq %rbx,%rcx |
| leaq -1(%rsi,%rbx,1),%rsi |
| pxor %xmm15,%xmm15 |
| .Lseal_sse_tail_16_compose: |
| pslldq $1,%xmm15 |
| pinsrb $0,(%rsi),%xmm15 |
| leaq -1(%rsi),%rsi |
| decq %rcx |
| jne .Lseal_sse_tail_16_compose |
| |
| |
| pxor %xmm0,%xmm15 |
| |
| |
| movq %rbx,%rcx |
| movdqu %xmm15,%xmm0 |
| .Lseal_sse_tail_16_extract: |
| pextrb $0,%xmm0,(%rdi) |
| psrldq $1,%xmm0 |
| addq $1,%rdi |
| subq $1,%rcx |
| jnz .Lseal_sse_tail_16_extract |
| |
| |
| |
| |
| |
| |
| |
| |
| movq 288 + 0 + 32(%rsp),%r9 |
| movq 56(%r9),%r14 |
| movq 48(%r9),%r13 |
| testq %r14,%r14 |
| jz .Lprocess_partial_block |
| |
| movq $16,%r15 |
| subq %rbx,%r15 |
| cmpq %r15,%r14 |
| |
| jge .Lload_extra_in |
| movq %r14,%r15 |
| |
| .Lload_extra_in: |
| |
| |
| leaq -1(%r13,%r15,1),%rsi |
| |
| |
| addq %r15,%r13 |
| subq %r15,%r14 |
| movq %r13,48(%r9) |
| movq %r14,56(%r9) |
| |
| |
| |
| addq %r15,%r8 |
| |
| |
| pxor %xmm11,%xmm11 |
| .Lload_extra_load_loop: |
| pslldq $1,%xmm11 |
| pinsrb $0,(%rsi),%xmm11 |
| leaq -1(%rsi),%rsi |
| subq $1,%r15 |
| jnz .Lload_extra_load_loop |
| |
| |
| |
| |
| movq %rbx,%r15 |
| |
| .Lload_extra_shift_loop: |
| pslldq $1,%xmm11 |
| subq $1,%r15 |
| jnz .Lload_extra_shift_loop |
| |
| |
| |
| |
| leaq .Land_masks(%rip),%r15 |
| shlq $4,%rbx |
| pand -16(%r15,%rbx,1),%xmm15 |
| |
| |
| por %xmm11,%xmm15 |
| |
| |
| |
| .byte 102,77,15,126,253 |
| pextrq $1,%xmm15,%r14 |
| addq %r13,%r10 |
| adcq %r14,%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| |
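| // Hash whole 16-byte blocks of the remaining extra input, then collect its tail |
| // into xmm15 for the masked partial-block absorption below. |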
| .Lprocess_blocks_of_extra_in: |
| |
| movq 288+32+0 (%rsp),%r9 |
| movq 48(%r9),%rsi |
| movq 56(%r9),%r8 |
| movq %r8,%rcx |
| shrq $4,%r8 |
| |
| .Lprocess_extra_hash_loop: |
| jz .Lprocess_extra_in_trailer |
| addq 0+0(%rsi),%r10 |
| adcq 8+0(%rsi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 16(%rsi),%rsi |
| subq $1,%r8 |
| jmp .Lprocess_extra_hash_loop |
| .Lprocess_extra_in_trailer: |
| andq $15,%rcx |
| movq %rcx,%rbx |
| jz .Ldo_length_block |
| leaq -1(%rsi,%rcx,1),%rsi |
| |
| .Lprocess_extra_in_trailer_load: |
| pslldq $1,%xmm15 |
| pinsrb $0,(%rsi),%xmm15 |
| leaq -1(%rsi),%rsi |
| subq $1,%rcx |
| jnz .Lprocess_extra_in_trailer_load |
| |
| .Lprocess_partial_block: |
| |
| leaq .Land_masks(%rip),%r15 |
| shlq $4,%rbx |
| pand -16(%r15,%rbx,1),%xmm15 |
| .byte 102,77,15,126,253 |
| pextrq $1,%xmm15,%r14 |
| addq %r13,%r10 |
| adcq %r14,%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| |
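| // Absorb the block holding the AD and ciphertext lengths (saved at 32(%rbp)), |
| // perform the final modular reduction, and add the key's s half from 16(%rbp); |
| // the tag is written through the pointer popped back into %r9. |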
| .Ldo_length_block: |
| addq 0+0+32(%rbp),%r10 |
| adcq 8+0+32(%rbp),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| |
| movq %r10,%r13 |
| movq %r11,%r14 |
| movq %r12,%r15 |
| subq $-5,%r10 |
| sbbq $-1,%r11 |
| sbbq $3,%r12 |
| cmovcq %r13,%r10 |
| cmovcq %r14,%r11 |
| cmovcq %r15,%r12 |
| |
| addq 0+0+16(%rbp),%r10 |
| adcq 8+0+16(%rbp),%r11 |
| |
| .cfi_remember_state |
| addq $288 + 0 + 32,%rsp |
| .cfi_adjust_cfa_offset -(288 + 32) |
| |
| popq %r9 |
| .cfi_adjust_cfa_offset -8 |
| .cfi_restore %r9 |
| movq %r10,(%r9) |
| movq %r11,8(%r9) |
| popq %r15 |
| .cfi_adjust_cfa_offset -8 |
| .cfi_restore %r15 |
| popq %r14 |
| .cfi_adjust_cfa_offset -8 |
| .cfi_restore %r14 |
| popq %r13 |
| .cfi_adjust_cfa_offset -8 |
| .cfi_restore %r13 |
| popq %r12 |
| .cfi_adjust_cfa_offset -8 |
| .cfi_restore %r12 |
| popq %rbx |
| .cfi_adjust_cfa_offset -8 |
| .cfi_restore %rbx |
| popq %rbp |
| .cfi_adjust_cfa_offset -8 |
| .cfi_restore %rbp |
| ret |
| |
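| // Short-input seal path: three ChaCha20 blocks supply the Poly1305 key (the |
| // clamped block) plus keystream; encryption and hashing reuse |
| // .Lseal_sse_128_tail_xor. |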
| .Lseal_sse_128: |
| .cfi_restore_state |
| movdqu .Lchacha20_consts(%rip),%xmm0 |
| movdqa %xmm0,%xmm1 |
| movdqa %xmm0,%xmm2 |
| movdqu 0(%r9),%xmm4 |
| movdqa %xmm4,%xmm5 |
| movdqa %xmm4,%xmm6 |
| movdqu 16(%r9),%xmm8 |
| movdqa %xmm8,%xmm9 |
| movdqa %xmm8,%xmm10 |
| movdqu 32(%r9),%xmm14 |
| movdqa %xmm14,%xmm12 |
| paddd .Lsse_inc(%rip),%xmm12 |
| movdqa %xmm12,%xmm13 |
| paddd .Lsse_inc(%rip),%xmm13 |
| movdqa %xmm4,%xmm7 |
| movdqa %xmm8,%xmm11 |
| movdqa %xmm12,%xmm15 |
| movq $10,%r10 |
| |
| .Lseal_sse_128_rounds: |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol16(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm4 |
| pxor %xmm3,%xmm4 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol8(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm4 |
| pxor %xmm3,%xmm4 |
| .byte 102,15,58,15,228,4 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,12 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol16(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm5 |
| pxor %xmm3,%xmm5 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol8(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm5 |
| pxor %xmm3,%xmm5 |
| .byte 102,15,58,15,237,4 |
| .byte 102,69,15,58,15,201,8 |
| .byte 102,69,15,58,15,237,12 |
| paddd %xmm6,%xmm2 |
| pxor %xmm2,%xmm14 |
| pshufb .Lrol16(%rip),%xmm14 |
| paddd %xmm14,%xmm10 |
| pxor %xmm10,%xmm6 |
| movdqa %xmm6,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm6 |
| pxor %xmm3,%xmm6 |
| paddd %xmm6,%xmm2 |
| pxor %xmm2,%xmm14 |
| pshufb .Lrol8(%rip),%xmm14 |
| paddd %xmm14,%xmm10 |
| pxor %xmm10,%xmm6 |
| movdqa %xmm6,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm6 |
| pxor %xmm3,%xmm6 |
| .byte 102,15,58,15,246,4 |
| .byte 102,69,15,58,15,210,8 |
| .byte 102,69,15,58,15,246,12 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol16(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm4 |
| pxor %xmm3,%xmm4 |
| paddd %xmm4,%xmm0 |
| pxor %xmm0,%xmm12 |
| pshufb .Lrol8(%rip),%xmm12 |
| paddd %xmm12,%xmm8 |
| pxor %xmm8,%xmm4 |
| movdqa %xmm4,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm4 |
| pxor %xmm3,%xmm4 |
| .byte 102,15,58,15,228,12 |
| .byte 102,69,15,58,15,192,8 |
| .byte 102,69,15,58,15,228,4 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol16(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm5 |
| pxor %xmm3,%xmm5 |
| paddd %xmm5,%xmm1 |
| pxor %xmm1,%xmm13 |
| pshufb .Lrol8(%rip),%xmm13 |
| paddd %xmm13,%xmm9 |
| pxor %xmm9,%xmm5 |
| movdqa %xmm5,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm5 |
| pxor %xmm3,%xmm5 |
| .byte 102,15,58,15,237,12 |
| .byte 102,69,15,58,15,201,8 |
| .byte 102,69,15,58,15,237,4 |
| paddd %xmm6,%xmm2 |
| pxor %xmm2,%xmm14 |
| pshufb .Lrol16(%rip),%xmm14 |
| paddd %xmm14,%xmm10 |
| pxor %xmm10,%xmm6 |
| movdqa %xmm6,%xmm3 |
| pslld $12,%xmm3 |
| psrld $20,%xmm6 |
| pxor %xmm3,%xmm6 |
| paddd %xmm6,%xmm2 |
| pxor %xmm2,%xmm14 |
| pshufb .Lrol8(%rip),%xmm14 |
| paddd %xmm14,%xmm10 |
| pxor %xmm10,%xmm6 |
| movdqa %xmm6,%xmm3 |
| pslld $7,%xmm3 |
| psrld $25,%xmm6 |
| pxor %xmm3,%xmm6 |
| .byte 102,15,58,15,246,12 |
| .byte 102,69,15,58,15,210,8 |
| .byte 102,69,15,58,15,246,4 |
| |
| decq %r10 |
| jnz .Lseal_sse_128_rounds |
| paddd .Lchacha20_consts(%rip),%xmm0 |
| paddd .Lchacha20_consts(%rip),%xmm1 |
| paddd .Lchacha20_consts(%rip),%xmm2 |
| paddd %xmm7,%xmm4 |
| paddd %xmm7,%xmm5 |
| paddd %xmm7,%xmm6 |
| paddd %xmm11,%xmm8 |
| paddd %xmm11,%xmm9 |
| paddd %xmm15,%xmm12 |
| paddd .Lsse_inc(%rip),%xmm15 |
| paddd %xmm15,%xmm13 |
| |
| pand .Lclamp(%rip),%xmm2 |
| movdqa %xmm2,0+0(%rbp) |
| movdqa %xmm6,0+16(%rbp) |
| |
| movq %r8,%r8 |
| call poly_hash_ad_internal |
| jmp .Lseal_sse_128_tail_xor |
| .size chacha20_poly1305_seal_nohw, .-chacha20_poly1305_seal_nohw |
| .cfi_endproc |
| |
| |
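| // AVX2 open (decrypt) path: Poly1305 runs over the ciphertext while it is |
| // decrypted, mirroring the SSE code above but with 32-byte ymm registers and |
| // 512 bytes per main-loop pass. |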
| .globl chacha20_poly1305_open_avx2 |
| .hidden chacha20_poly1305_open_avx2 |
| .type chacha20_poly1305_open_avx2,@function |
| .align 64 |
| chacha20_poly1305_open_avx2: |
| .cfi_startproc |
| _CET_ENDBR |
| pushq %rbp |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %rbp,-16 |
| pushq %rbx |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %rbx,-24 |
| pushq %r12 |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %r12,-32 |
| pushq %r13 |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %r13,-40 |
| pushq %r14 |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %r14,-48 |
| pushq %r15 |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %r15,-56 |
| |
| |
| pushq %r9 |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %r9,-64 |
| subq $288 + 0 + 32,%rsp |
| .cfi_adjust_cfa_offset 288 + 32 |
| |
| leaq 32(%rsp),%rbp |
| andq $-32,%rbp |
| |
| movq %rdx,%rbx |
| movq %r8,0+0+32(%rbp) |
| movq %rbx,8+0+32(%rbp) |
| |
| vzeroupper |
| vmovdqa .Lchacha20_consts(%rip),%ymm0 |
| vbroadcasti128 0(%r9),%ymm4 |
| vbroadcasti128 16(%r9),%ymm8 |
| vbroadcasti128 32(%r9),%ymm12 |
| vpaddd .Lavx2_init(%rip),%ymm12,%ymm12 |
| cmpq $192,%rbx |
| jbe .Lopen_avx2_192 |
| cmpq $320,%rbx |
| jbe .Lopen_avx2_320 |
| |
| vmovdqa %ymm4,0+64(%rbp) |
| vmovdqa %ymm8,0+96(%rbp) |
| vmovdqa %ymm12,0+160(%rbp) |
| movq $10,%r10 |
| .Lopen_avx2_init_rounds: |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol16(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpsrld $20,%ymm4,%ymm3 |
| vpslld $12,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol8(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpslld $7,%ymm4,%ymm3 |
| vpsrld $25,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpalignr $12,%ymm12,%ymm12,%ymm12 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $4,%ymm4,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol16(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpsrld $20,%ymm4,%ymm3 |
| vpslld $12,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol8(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpslld $7,%ymm4,%ymm3 |
| vpsrld $25,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpalignr $4,%ymm12,%ymm12,%ymm12 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $12,%ymm4,%ymm4,%ymm4 |
| |
| decq %r10 |
| jne .Lopen_avx2_init_rounds |
| vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 |
| vpaddd 0+64(%rbp),%ymm4,%ymm4 |
| vpaddd 0+96(%rbp),%ymm8,%ymm8 |
| vpaddd 0+160(%rbp),%ymm12,%ymm12 |
| |
| vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 |
| |
| vpand .Lclamp(%rip),%ymm3,%ymm3 |
| vmovdqa %ymm3,0+0(%rbp) |
| |
| vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 |
| vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 |
| |
| movq %r8,%r8 |
| call poly_hash_ad_internal |
| |
| xorq %rcx,%rcx |
| .Lopen_avx2_init_hash: |
| addq 0+0(%rsi,%rcx,1),%r10 |
| adcq 8+0(%rsi,%rcx,1),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| addq $16,%rcx |
| cmpq $64,%rcx |
| jne .Lopen_avx2_init_hash |
| |
| vpxor 0(%rsi),%ymm0,%ymm0 |
| vpxor 32(%rsi),%ymm4,%ymm4 |
| |
| vmovdqu %ymm0,0(%rdi) |
| vmovdqu %ymm4,32(%rdi) |
| leaq 64(%rsi),%rsi |
| leaq 64(%rdi),%rdi |
| subq $64,%rbx |
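| // Main open loop: eight 64-byte blocks (four ymm states, two blocks each) per |
| // 512-byte pass, hashing the ciphertext at (%rsi) as the rounds run. |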
| .Lopen_avx2_main_loop: |
| |
| cmpq $512,%rbx |
| jb .Lopen_avx2_main_loop_done |
| vmovdqa .Lchacha20_consts(%rip),%ymm0 |
| vmovdqa 0+64(%rbp),%ymm4 |
| vmovdqa 0+96(%rbp),%ymm8 |
| vmovdqa %ymm0,%ymm1 |
| vmovdqa %ymm4,%ymm5 |
| vmovdqa %ymm8,%ymm9 |
| vmovdqa %ymm0,%ymm2 |
| vmovdqa %ymm4,%ymm6 |
| vmovdqa %ymm8,%ymm10 |
| vmovdqa %ymm0,%ymm3 |
| vmovdqa %ymm4,%ymm7 |
| vmovdqa %ymm8,%ymm11 |
| vmovdqa .Lavx2_inc(%rip),%ymm12 |
| vpaddd 0+160(%rbp),%ymm12,%ymm15 |
| vpaddd %ymm15,%ymm12,%ymm14 |
| vpaddd %ymm14,%ymm12,%ymm13 |
| vpaddd %ymm13,%ymm12,%ymm12 |
| vmovdqa %ymm15,0+256(%rbp) |
| vmovdqa %ymm14,0+224(%rbp) |
| vmovdqa %ymm13,0+192(%rbp) |
| vmovdqa %ymm12,0+160(%rbp) |
| |
| xorq %rcx,%rcx |
| .Lopen_avx2_main_loop_rounds: |
| addq 0+0(%rsi,%rcx,1),%r10 |
| adcq 8+0(%rsi,%rcx,1),%r11 |
| adcq $1,%r12 |
| vmovdqa %ymm8,0+128(%rbp) |
| vmovdqa .Lrol16(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $20,%ymm7,%ymm8 |
| vpslld $32-20,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $20,%ymm6,%ymm8 |
| vpslld $32-20,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| vpsrld $20,%ymm5,%ymm8 |
| vpslld $32-20,%ymm5,%ymm5 |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $20,%ymm4,%ymm8 |
| vpslld $32-20,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa .Lrol8(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| addq 0+16(%rsi,%rcx,1),%r10 |
| adcq 8+16(%rsi,%rcx,1),%r11 |
| adcq $1,%r12 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $25,%ymm7,%ymm8 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| vpslld $32-25,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $25,%ymm6,%ymm8 |
| vpslld $32-25,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| vpsrld $25,%ymm5,%ymm8 |
| vpslld $32-25,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $25,%ymm4,%ymm8 |
| vpslld $32-25,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa 0+128(%rbp),%ymm8 |
| vpalignr $4,%ymm7,%ymm7,%ymm7 |
| vpalignr $8,%ymm11,%ymm11,%ymm11 |
| vpalignr $12,%ymm15,%ymm15,%ymm15 |
| vpalignr $4,%ymm6,%ymm6,%ymm6 |
| vpalignr $8,%ymm10,%ymm10,%ymm10 |
| vpalignr $12,%ymm14,%ymm14,%ymm14 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| vpalignr $4,%ymm5,%ymm5,%ymm5 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $12,%ymm13,%ymm13,%ymm13 |
| vpalignr $4,%ymm4,%ymm4,%ymm4 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $12,%ymm12,%ymm12,%ymm12 |
| vmovdqa %ymm8,0+128(%rbp) |
| vmovdqa .Lrol16(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $20,%ymm7,%ymm8 |
| vpslld $32-20,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $20,%ymm6,%ymm8 |
| vpslld $32-20,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| addq 0+32(%rsi,%rcx,1),%r10 |
| adcq 8+32(%rsi,%rcx,1),%r11 |
| adcq $1,%r12 |
| |
| leaq 48(%rcx),%rcx |
| vpsrld $20,%ymm5,%ymm8 |
| vpslld $32-20,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $20,%ymm4,%ymm8 |
| vpslld $32-20,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa .Lrol8(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $25,%ymm7,%ymm8 |
| vpslld $32-25,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $25,%ymm6,%ymm8 |
| vpslld $32-25,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| vpsrld $25,%ymm5,%ymm8 |
| vpslld $32-25,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $25,%ymm4,%ymm8 |
| vpslld $32-25,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa 0+128(%rbp),%ymm8 |
| vpalignr $12,%ymm7,%ymm7,%ymm7 |
| vpalignr $8,%ymm11,%ymm11,%ymm11 |
| vpalignr $4,%ymm15,%ymm15,%ymm15 |
| vpalignr $12,%ymm6,%ymm6,%ymm6 |
| vpalignr $8,%ymm10,%ymm10,%ymm10 |
| vpalignr $4,%ymm14,%ymm14,%ymm14 |
| vpalignr $12,%ymm5,%ymm5,%ymm5 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $4,%ymm13,%ymm13,%ymm13 |
| vpalignr $12,%ymm4,%ymm4,%ymm4 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| vpalignr $4,%ymm12,%ymm12,%ymm12 |
| |
| cmpq $60*8,%rcx |
| jne .Lopen_avx2_main_loop_rounds |
| vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 |
| vpaddd 0+64(%rbp),%ymm7,%ymm7 |
| vpaddd 0+96(%rbp),%ymm11,%ymm11 |
| vpaddd 0+256(%rbp),%ymm15,%ymm15 |
| vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 |
| vpaddd 0+64(%rbp),%ymm6,%ymm6 |
| vpaddd 0+96(%rbp),%ymm10,%ymm10 |
| vpaddd 0+224(%rbp),%ymm14,%ymm14 |
| vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 |
| vpaddd 0+64(%rbp),%ymm5,%ymm5 |
| vpaddd 0+96(%rbp),%ymm9,%ymm9 |
| vpaddd 0+192(%rbp),%ymm13,%ymm13 |
| vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 |
| vpaddd 0+64(%rbp),%ymm4,%ymm4 |
| vpaddd 0+96(%rbp),%ymm8,%ymm8 |
| vpaddd 0+160(%rbp),%ymm12,%ymm12 |
| |
| vmovdqa %ymm0,0+128(%rbp) |
| addq 0+60*8(%rsi),%r10 |
| adcq 8+60*8(%rsi),%r11 |
| adcq $1,%r12 |
| vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 |
| vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 |
| vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 |
| vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 |
| vpxor 0+0(%rsi),%ymm0,%ymm0 |
| vpxor 32+0(%rsi),%ymm3,%ymm3 |
| vpxor 64+0(%rsi),%ymm7,%ymm7 |
| vpxor 96+0(%rsi),%ymm11,%ymm11 |
| vmovdqu %ymm0,0+0(%rdi) |
| vmovdqu %ymm3,32+0(%rdi) |
| vmovdqu %ymm7,64+0(%rdi) |
| vmovdqu %ymm11,96+0(%rdi) |
| |
| vmovdqa 0+128(%rbp),%ymm0 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 |
| vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 |
| vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 |
| vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 |
| vpxor 0+128(%rsi),%ymm3,%ymm3 |
| vpxor 32+128(%rsi),%ymm2,%ymm2 |
| vpxor 64+128(%rsi),%ymm6,%ymm6 |
| vpxor 96+128(%rsi),%ymm10,%ymm10 |
| vmovdqu %ymm3,0+128(%rdi) |
| vmovdqu %ymm2,32+128(%rdi) |
| vmovdqu %ymm6,64+128(%rdi) |
| vmovdqu %ymm10,96+128(%rdi) |
| addq 0+60*8+16(%rsi),%r10 |
| adcq 8+60*8+16(%rsi),%r11 |
| adcq $1,%r12 |
| vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 |
| vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 |
| vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 |
| vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 |
| vpxor 0+256(%rsi),%ymm3,%ymm3 |
| vpxor 32+256(%rsi),%ymm1,%ymm1 |
| vpxor 64+256(%rsi),%ymm5,%ymm5 |
| vpxor 96+256(%rsi),%ymm9,%ymm9 |
| vmovdqu %ymm3,0+256(%rdi) |
| vmovdqu %ymm1,32+256(%rdi) |
| vmovdqu %ymm5,64+256(%rdi) |
| vmovdqu %ymm9,96+256(%rdi) |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 |
| vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 |
| vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 |
| vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 |
| vpxor 0+384(%rsi),%ymm3,%ymm3 |
| vpxor 32+384(%rsi),%ymm0,%ymm0 |
| vpxor 64+384(%rsi),%ymm4,%ymm4 |
| vpxor 96+384(%rsi),%ymm8,%ymm8 |
| vmovdqu %ymm3,0+384(%rdi) |
| vmovdqu %ymm0,32+384(%rdi) |
| vmovdqu %ymm4,64+384(%rdi) |
| vmovdqu %ymm8,96+384(%rdi) |
| |
| leaq 512(%rsi),%rsi |
| leaq 512(%rdi),%rdi |
| subq $512,%rbx |
| jmp .Lopen_avx2_main_loop |
| .Lopen_avx2_main_loop_done: |
| testq %rbx,%rbx |
| vzeroupper |
| je .Lopen_sse_finalize |
| |
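| // Fewer than 512 bytes remain: dispatch to the AVX2 tail sized for the leftover data. |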
| cmpq $384,%rbx |
| ja .Lopen_avx2_tail_512 |
| cmpq $256,%rbx |
| ja .Lopen_avx2_tail_384 |
| cmpq $128,%rbx |
| ja .Lopen_avx2_tail_256 |
| vmovdqa .Lchacha20_consts(%rip),%ymm0 |
| vmovdqa 0+64(%rbp),%ymm4 |
| vmovdqa 0+96(%rbp),%ymm8 |
| vmovdqa .Lavx2_inc(%rip),%ymm12 |
| vpaddd 0+160(%rbp),%ymm12,%ymm12 |
| vmovdqa %ymm12,0+160(%rbp) |
| |
| xorq %r8,%r8 |
| movq %rbx,%rcx |
| andq $-16,%rcx |
| testq %rcx,%rcx |
| je .Lopen_avx2_tail_128_rounds |
| .Lopen_avx2_tail_128_rounds_and_x1hash: |
| addq 0+0(%rsi,%r8,1),%r10 |
| adcq 8+0(%rsi,%r8,1),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| .Lopen_avx2_tail_128_rounds: |
| addq $16,%r8 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol16(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpsrld $20,%ymm4,%ymm3 |
| vpslld $12,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol8(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpslld $7,%ymm4,%ymm3 |
| vpsrld $25,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpalignr $12,%ymm12,%ymm12,%ymm12 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $4,%ymm4,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol16(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpsrld $20,%ymm4,%ymm3 |
| vpslld $12,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol8(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpslld $7,%ymm4,%ymm3 |
| vpsrld $25,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpalignr $4,%ymm12,%ymm12,%ymm12 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $12,%ymm4,%ymm4,%ymm4 |
| |
| cmpq %rcx,%r8 |
| jb .Lopen_avx2_tail_128_rounds_and_x1hash |
| cmpq $160,%r8 |
| jne .Lopen_avx2_tail_128_rounds |
| vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 |
| vpaddd 0+64(%rbp),%ymm4,%ymm4 |
| vpaddd 0+96(%rbp),%ymm8,%ymm8 |
| vpaddd 0+160(%rbp),%ymm12,%ymm12 |
| vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 |
| vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 |
| vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 |
| vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 |
| vmovdqa %ymm3,%ymm8 |
| |
| jmp .Lopen_avx2_tail_128_xor |
| |
| .Lopen_avx2_tail_256: |
| vmovdqa .Lchacha20_consts(%rip),%ymm0 |
| vmovdqa 0+64(%rbp),%ymm4 |
| vmovdqa 0+96(%rbp),%ymm8 |
| vmovdqa %ymm0,%ymm1 |
| vmovdqa %ymm4,%ymm5 |
| vmovdqa %ymm8,%ymm9 |
| vmovdqa .Lavx2_inc(%rip),%ymm12 |
| vpaddd 0+160(%rbp),%ymm12,%ymm13 |
| vpaddd %ymm13,%ymm12,%ymm12 |
| vmovdqa %ymm12,0+160(%rbp) |
| vmovdqa %ymm13,0+192(%rbp) |
| |
| movq %rbx,0+128(%rbp) |
| movq %rbx,%rcx |
| subq $128,%rcx |
| shrq $4,%rcx |
| movq $10,%r8 |
| cmpq $10,%rcx |
| cmovgq %r8,%rcx |
| movq %rsi,%rbx |
| xorq %r8,%r8 |
| .Lopen_avx2_tail_256_rounds_and_x1hash: |
| addq 0+0(%rbx),%r10 |
| adcq 8+0(%rbx),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 16(%rbx),%rbx |
| .Lopen_avx2_tail_256_rounds: |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol16(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpsrld $20,%ymm4,%ymm3 |
| vpslld $12,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol8(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpslld $7,%ymm4,%ymm3 |
| vpsrld $25,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpalignr $12,%ymm12,%ymm12,%ymm12 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $4,%ymm4,%ymm4,%ymm4 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol16(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpsrld $20,%ymm5,%ymm3 |
| vpslld $12,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol8(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpslld $7,%ymm5,%ymm3 |
| vpsrld $25,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpalignr $12,%ymm13,%ymm13,%ymm13 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $4,%ymm5,%ymm5,%ymm5 |
| |
| incq %r8 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol16(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpsrld $20,%ymm4,%ymm3 |
| vpslld $12,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol8(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpslld $7,%ymm4,%ymm3 |
| vpsrld $25,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpalignr $4,%ymm12,%ymm12,%ymm12 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $12,%ymm4,%ymm4,%ymm4 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol16(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpsrld $20,%ymm5,%ymm3 |
| vpslld $12,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol8(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpslld $7,%ymm5,%ymm3 |
| vpsrld $25,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpalignr $4,%ymm13,%ymm13,%ymm13 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $12,%ymm5,%ymm5,%ymm5 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpshufb .Lrol16(%rip),%ymm14,%ymm14 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpsrld $20,%ymm6,%ymm3 |
| vpslld $12,%ymm6,%ymm6 |
| vpxor %ymm3,%ymm6,%ymm6 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpshufb .Lrol8(%rip),%ymm14,%ymm14 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpslld $7,%ymm6,%ymm3 |
| vpsrld $25,%ymm6,%ymm6 |
| vpxor %ymm3,%ymm6,%ymm6 |
| vpalignr $4,%ymm14,%ymm14,%ymm14 |
| vpalignr $8,%ymm10,%ymm10,%ymm10 |
| vpalignr $12,%ymm6,%ymm6,%ymm6 |
| |
| cmpq %rcx,%r8 |
| jb .Lopen_avx2_tail_256_rounds_and_x1hash |
| cmpq $10,%r8 |
| jne .Lopen_avx2_tail_256_rounds |
| movq %rbx,%r8 |
| subq %rsi,%rbx |
| movq %rbx,%rcx |
| movq 0+128(%rbp),%rbx |
| .Lopen_avx2_tail_256_hash: |
| addq $16,%rcx |
| cmpq %rbx,%rcx |
| jg .Lopen_avx2_tail_256_done |
| addq 0+0(%r8),%r10 |
| adcq 8+0(%r8),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 16(%r8),%r8 |
| jmp .Lopen_avx2_tail_256_hash |
| .Lopen_avx2_tail_256_done: |
| vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 |
| vpaddd 0+64(%rbp),%ymm5,%ymm5 |
| vpaddd 0+96(%rbp),%ymm9,%ymm9 |
| vpaddd 0+192(%rbp),%ymm13,%ymm13 |
| vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 |
| vpaddd 0+64(%rbp),%ymm4,%ymm4 |
| vpaddd 0+96(%rbp),%ymm8,%ymm8 |
| vpaddd 0+160(%rbp),%ymm12,%ymm12 |
| vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 |
| vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 |
| vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 |
| vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 |
| vpxor 0+0(%rsi),%ymm3,%ymm3 |
| vpxor 32+0(%rsi),%ymm1,%ymm1 |
| vpxor 64+0(%rsi),%ymm5,%ymm5 |
| vpxor 96+0(%rsi),%ymm9,%ymm9 |
| vmovdqu %ymm3,0+0(%rdi) |
| vmovdqu %ymm1,32+0(%rdi) |
| vmovdqu %ymm5,64+0(%rdi) |
| vmovdqu %ymm9,96+0(%rdi) |
| vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 |
| vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 |
| vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 |
| vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 |
| vmovdqa %ymm3,%ymm8 |
| |
| leaq 128(%rsi),%rsi |
| leaq 128(%rdi),%rdi |
| subq $128,%rbx |
| jmp .Lopen_avx2_tail_128_xor |
| |
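| // Open tail, at most 384 bytes remain: three two-block-wide ChaCha20 states, with up to two Poly1305 blocks absorbed per double round. |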
| .Lopen_avx2_tail_384: |
| vmovdqa .Lchacha20_consts(%rip),%ymm0 |
| vmovdqa 0+64(%rbp),%ymm4 |
| vmovdqa 0+96(%rbp),%ymm8 |
| vmovdqa %ymm0,%ymm1 |
| vmovdqa %ymm4,%ymm5 |
| vmovdqa %ymm8,%ymm9 |
| vmovdqa %ymm0,%ymm2 |
| vmovdqa %ymm4,%ymm6 |
| vmovdqa %ymm8,%ymm10 |
| vmovdqa .Lavx2_inc(%rip),%ymm12 |
| vpaddd 0+160(%rbp),%ymm12,%ymm14 |
| vpaddd %ymm14,%ymm12,%ymm13 |
| vpaddd %ymm13,%ymm12,%ymm12 |
| vmovdqa %ymm12,0+160(%rbp) |
| vmovdqa %ymm13,0+192(%rbp) |
| vmovdqa %ymm14,0+224(%rbp) |
| |
| movq %rbx,0+128(%rbp) |
| movq %rbx,%rcx |
| subq $256,%rcx |
| shrq $4,%rcx |
| addq $6,%rcx |
| movq $10,%r8 |
| cmpq $10,%rcx |
| cmovgq %r8,%rcx |
| movq %rsi,%rbx |
| xorq %r8,%r8 |
| .Lopen_avx2_tail_384_rounds_and_x2hash: |
| addq 0+0(%rbx),%r10 |
| adcq 8+0(%rbx),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 16(%rbx),%rbx |
| .Lopen_avx2_tail_384_rounds_and_x1hash: |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpshufb .Lrol16(%rip),%ymm14,%ymm14 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpsrld $20,%ymm6,%ymm3 |
| vpslld $12,%ymm6,%ymm6 |
| vpxor %ymm3,%ymm6,%ymm6 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpshufb .Lrol8(%rip),%ymm14,%ymm14 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpslld $7,%ymm6,%ymm3 |
| vpsrld $25,%ymm6,%ymm6 |
| vpxor %ymm3,%ymm6,%ymm6 |
| vpalignr $12,%ymm14,%ymm14,%ymm14 |
| vpalignr $8,%ymm10,%ymm10,%ymm10 |
| vpalignr $4,%ymm6,%ymm6,%ymm6 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol16(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpsrld $20,%ymm5,%ymm3 |
| vpslld $12,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol8(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpslld $7,%ymm5,%ymm3 |
| vpsrld $25,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpalignr $12,%ymm13,%ymm13,%ymm13 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $4,%ymm5,%ymm5,%ymm5 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol16(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpsrld $20,%ymm4,%ymm3 |
| vpslld $12,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol8(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpslld $7,%ymm4,%ymm3 |
| vpsrld $25,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpalignr $12,%ymm12,%ymm12,%ymm12 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $4,%ymm4,%ymm4,%ymm4 |
| addq 0+0(%rbx),%r10 |
| adcq 8+0(%rbx),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 16(%rbx),%rbx |
| incq %r8 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpshufb .Lrol16(%rip),%ymm14,%ymm14 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpsrld $20,%ymm6,%ymm3 |
| vpslld $12,%ymm6,%ymm6 |
| vpxor %ymm3,%ymm6,%ymm6 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpshufb .Lrol8(%rip),%ymm14,%ymm14 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpslld $7,%ymm6,%ymm3 |
| vpsrld $25,%ymm6,%ymm6 |
| vpxor %ymm3,%ymm6,%ymm6 |
| vpalignr $4,%ymm14,%ymm14,%ymm14 |
| vpalignr $8,%ymm10,%ymm10,%ymm10 |
| vpalignr $12,%ymm6,%ymm6,%ymm6 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol16(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpsrld $20,%ymm5,%ymm3 |
| vpslld $12,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol8(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpslld $7,%ymm5,%ymm3 |
| vpsrld $25,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpalignr $4,%ymm13,%ymm13,%ymm13 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $12,%ymm5,%ymm5,%ymm5 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol16(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpsrld $20,%ymm4,%ymm3 |
| vpslld $12,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol8(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpslld $7,%ymm4,%ymm3 |
| vpsrld $25,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpalignr $4,%ymm12,%ymm12,%ymm12 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $12,%ymm4,%ymm4,%ymm4 |
| |
| cmpq %rcx,%r8 |
| jb .Lopen_avx2_tail_384_rounds_and_x2hash |
| cmpq $10,%r8 |
| jne .Lopen_avx2_tail_384_rounds_and_x1hash |
| movq %rbx,%r8 |
| subq %rsi,%rbx |
| movq %rbx,%rcx |
| movq 0+128(%rbp),%rbx |
| .Lopen_avx2_384_tail_hash: |
| addq $16,%rcx |
| cmpq %rbx,%rcx |
| jg .Lopen_avx2_384_tail_done |
| addq 0+0(%r8),%r10 |
| adcq 8+0(%r8),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 16(%r8),%r8 |
| jmp .Lopen_avx2_384_tail_hash |
| .Lopen_avx2_384_tail_done: |
| vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 |
| vpaddd 0+64(%rbp),%ymm6,%ymm6 |
| vpaddd 0+96(%rbp),%ymm10,%ymm10 |
| vpaddd 0+224(%rbp),%ymm14,%ymm14 |
| vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 |
| vpaddd 0+64(%rbp),%ymm5,%ymm5 |
| vpaddd 0+96(%rbp),%ymm9,%ymm9 |
| vpaddd 0+192(%rbp),%ymm13,%ymm13 |
| vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 |
| vpaddd 0+64(%rbp),%ymm4,%ymm4 |
| vpaddd 0+96(%rbp),%ymm8,%ymm8 |
| vpaddd 0+160(%rbp),%ymm12,%ymm12 |
| vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 |
| vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 |
| vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 |
| vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 |
| vpxor 0+0(%rsi),%ymm3,%ymm3 |
| vpxor 32+0(%rsi),%ymm2,%ymm2 |
| vpxor 64+0(%rsi),%ymm6,%ymm6 |
| vpxor 96+0(%rsi),%ymm10,%ymm10 |
| vmovdqu %ymm3,0+0(%rdi) |
| vmovdqu %ymm2,32+0(%rdi) |
| vmovdqu %ymm6,64+0(%rdi) |
| vmovdqu %ymm10,96+0(%rdi) |
| vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 |
| vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 |
| vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 |
| vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 |
| vpxor 0+128(%rsi),%ymm3,%ymm3 |
| vpxor 32+128(%rsi),%ymm1,%ymm1 |
| vpxor 64+128(%rsi),%ymm5,%ymm5 |
| vpxor 96+128(%rsi),%ymm9,%ymm9 |
| vmovdqu %ymm3,0+128(%rdi) |
| vmovdqu %ymm1,32+128(%rdi) |
| vmovdqu %ymm5,64+128(%rdi) |
| vmovdqu %ymm9,96+128(%rdi) |
| vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 |
| vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 |
| vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 |
| vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 |
| vmovdqa %ymm3,%ymm8 |
| |
| leaq 256(%rsi),%rsi |
| leaq 256(%rdi),%rdi |
| subq $256,%rbx |
| jmp .Lopen_avx2_tail_128_xor |
| |
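| // Open tail, at most 512 bytes remain: four two-block-wide ChaCha20 states, with up to three Poly1305 blocks absorbed per double round. |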
| .Lopen_avx2_tail_512: |
| vmovdqa .Lchacha20_consts(%rip),%ymm0 |
| vmovdqa 0+64(%rbp),%ymm4 |
| vmovdqa 0+96(%rbp),%ymm8 |
| vmovdqa %ymm0,%ymm1 |
| vmovdqa %ymm4,%ymm5 |
| vmovdqa %ymm8,%ymm9 |
| vmovdqa %ymm0,%ymm2 |
| vmovdqa %ymm4,%ymm6 |
| vmovdqa %ymm8,%ymm10 |
| vmovdqa %ymm0,%ymm3 |
| vmovdqa %ymm4,%ymm7 |
| vmovdqa %ymm8,%ymm11 |
| vmovdqa .Lavx2_inc(%rip),%ymm12 |
| vpaddd 0+160(%rbp),%ymm12,%ymm15 |
| vpaddd %ymm15,%ymm12,%ymm14 |
| vpaddd %ymm14,%ymm12,%ymm13 |
| vpaddd %ymm13,%ymm12,%ymm12 |
| vmovdqa %ymm15,0+256(%rbp) |
| vmovdqa %ymm14,0+224(%rbp) |
| vmovdqa %ymm13,0+192(%rbp) |
| vmovdqa %ymm12,0+160(%rbp) |
| |
| xorq %rcx,%rcx |
| movq %rsi,%r8 |
| .Lopen_avx2_tail_512_rounds_and_x2hash: |
| addq 0+0(%r8),%r10 |
| adcq 8+0(%r8),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 16(%r8),%r8 |
| .Lopen_avx2_tail_512_rounds_and_x1hash: |
| vmovdqa %ymm8,0+128(%rbp) |
| vmovdqa .Lrol16(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $20,%ymm7,%ymm8 |
| vpslld $32-20,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $20,%ymm6,%ymm8 |
| vpslld $32-20,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| vpsrld $20,%ymm5,%ymm8 |
| vpslld $32-20,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $20,%ymm4,%ymm8 |
| vpslld $32-20,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa .Lrol8(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| addq 0+0(%r8),%r10 |
| adcq 8+0(%r8),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $25,%ymm7,%ymm8 |
| vpslld $32-25,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $25,%ymm6,%ymm8 |
| vpslld $32-25,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| vpsrld $25,%ymm5,%ymm8 |
| vpslld $32-25,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $25,%ymm4,%ymm8 |
| vpslld $32-25,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa 0+128(%rbp),%ymm8 |
| vpalignr $4,%ymm7,%ymm7,%ymm7 |
| vpalignr $8,%ymm11,%ymm11,%ymm11 |
| vpalignr $12,%ymm15,%ymm15,%ymm15 |
| vpalignr $4,%ymm6,%ymm6,%ymm6 |
| vpalignr $8,%ymm10,%ymm10,%ymm10 |
| vpalignr $12,%ymm14,%ymm14,%ymm14 |
| vpalignr $4,%ymm5,%ymm5,%ymm5 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $12,%ymm13,%ymm13,%ymm13 |
| vpalignr $4,%ymm4,%ymm4,%ymm4 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $12,%ymm12,%ymm12,%ymm12 |
| vmovdqa %ymm8,0+128(%rbp) |
| vmovdqa .Lrol16(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| addq 0+16(%r8),%r10 |
| adcq 8+16(%r8),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 32(%r8),%r8 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $20,%ymm7,%ymm8 |
| vpslld $32-20,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $20,%ymm6,%ymm8 |
| vpslld $32-20,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| vpsrld $20,%ymm5,%ymm8 |
| vpslld $32-20,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $20,%ymm4,%ymm8 |
| vpslld $32-20,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa .Lrol8(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $25,%ymm7,%ymm8 |
| vpslld $32-25,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $25,%ymm6,%ymm8 |
| vpslld $32-25,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| vpsrld $25,%ymm5,%ymm8 |
| vpslld $32-25,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $25,%ymm4,%ymm8 |
| vpslld $32-25,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa 0+128(%rbp),%ymm8 |
| vpalignr $12,%ymm7,%ymm7,%ymm7 |
| vpalignr $8,%ymm11,%ymm11,%ymm11 |
| vpalignr $4,%ymm15,%ymm15,%ymm15 |
| vpalignr $12,%ymm6,%ymm6,%ymm6 |
| vpalignr $8,%ymm10,%ymm10,%ymm10 |
| vpalignr $4,%ymm14,%ymm14,%ymm14 |
| vpalignr $12,%ymm5,%ymm5,%ymm5 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $4,%ymm13,%ymm13,%ymm13 |
| vpalignr $12,%ymm4,%ymm4,%ymm4 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $4,%ymm12,%ymm12,%ymm12 |
| |
| incq %rcx |
| cmpq $4,%rcx |
| jl .Lopen_avx2_tail_512_rounds_and_x2hash |
| cmpq $10,%rcx |
| jne .Lopen_avx2_tail_512_rounds_and_x1hash |
| movq %rbx,%rcx |
| subq $384,%rcx |
| andq $-16,%rcx |
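| // Catch up on Poly1305: hash any full 16-byte ciphertext blocks beyond the first 384 bytes before producing output. |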
| .Lopen_avx2_tail_512_hash: |
| testq %rcx,%rcx |
| je .Lopen_avx2_tail_512_done |
| addq 0+0(%r8),%r10 |
| adcq 8+0(%r8),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 16(%r8),%r8 |
| subq $16,%rcx |
| jmp .Lopen_avx2_tail_512_hash |
| .Lopen_avx2_tail_512_done: |
| vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 |
| vpaddd 0+64(%rbp),%ymm7,%ymm7 |
| vpaddd 0+96(%rbp),%ymm11,%ymm11 |
| vpaddd 0+256(%rbp),%ymm15,%ymm15 |
| vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 |
| vpaddd 0+64(%rbp),%ymm6,%ymm6 |
| vpaddd 0+96(%rbp),%ymm10,%ymm10 |
| vpaddd 0+224(%rbp),%ymm14,%ymm14 |
| vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 |
| vpaddd 0+64(%rbp),%ymm5,%ymm5 |
| vpaddd 0+96(%rbp),%ymm9,%ymm9 |
| vpaddd 0+192(%rbp),%ymm13,%ymm13 |
| vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 |
| vpaddd 0+64(%rbp),%ymm4,%ymm4 |
| vpaddd 0+96(%rbp),%ymm8,%ymm8 |
| vpaddd 0+160(%rbp),%ymm12,%ymm12 |
| |
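| // XOR the first 384 bytes of keystream against the ciphertext; the last state is left in ymm0/ymm4/ymm8/ymm12 for the generic 128-byte XOR loop below. |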
| vmovdqa %ymm0,0+128(%rbp) |
| vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 |
| vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 |
| vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 |
| vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 |
| vpxor 0+0(%rsi),%ymm0,%ymm0 |
| vpxor 32+0(%rsi),%ymm3,%ymm3 |
| vpxor 64+0(%rsi),%ymm7,%ymm7 |
| vpxor 96+0(%rsi),%ymm11,%ymm11 |
| vmovdqu %ymm0,0+0(%rdi) |
| vmovdqu %ymm3,32+0(%rdi) |
| vmovdqu %ymm7,64+0(%rdi) |
| vmovdqu %ymm11,96+0(%rdi) |
| |
| vmovdqa 0+128(%rbp),%ymm0 |
| vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 |
| vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 |
| vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 |
| vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 |
| vpxor 0+128(%rsi),%ymm3,%ymm3 |
| vpxor 32+128(%rsi),%ymm2,%ymm2 |
| vpxor 64+128(%rsi),%ymm6,%ymm6 |
| vpxor 96+128(%rsi),%ymm10,%ymm10 |
| vmovdqu %ymm3,0+128(%rdi) |
| vmovdqu %ymm2,32+128(%rdi) |
| vmovdqu %ymm6,64+128(%rdi) |
| vmovdqu %ymm10,96+128(%rdi) |
| vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 |
| vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 |
| vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 |
| vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 |
| vpxor 0+256(%rsi),%ymm3,%ymm3 |
| vpxor 32+256(%rsi),%ymm1,%ymm1 |
| vpxor 64+256(%rsi),%ymm5,%ymm5 |
| vpxor 96+256(%rsi),%ymm9,%ymm9 |
| vmovdqu %ymm3,0+256(%rdi) |
| vmovdqu %ymm1,32+256(%rdi) |
| vmovdqu %ymm5,64+256(%rdi) |
| vmovdqu %ymm9,96+256(%rdi) |
| vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 |
| vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 |
| vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 |
| vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 |
| vmovdqa %ymm3,%ymm8 |
| |
| leaq 384(%rsi),%rsi |
| leaq 384(%rdi),%rdi |
| subq $384,%rbx |
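| // Shared exit: XOR remaining keystream 32 bytes at a time (ymm0 -> ymm4 -> ymm8 -> ymm12), then one 16-byte chunk, leaving at most 15 bytes for the SSE tail. |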
| .Lopen_avx2_tail_128_xor: |
| cmpq $32,%rbx |
| jb .Lopen_avx2_tail_32_xor |
| subq $32,%rbx |
| vpxor (%rsi),%ymm0,%ymm0 |
| vmovdqu %ymm0,(%rdi) |
| leaq 32(%rsi),%rsi |
| leaq 32(%rdi),%rdi |
| vmovdqa %ymm4,%ymm0 |
| vmovdqa %ymm8,%ymm4 |
| vmovdqa %ymm12,%ymm8 |
| jmp .Lopen_avx2_tail_128_xor |
| .Lopen_avx2_tail_32_xor: |
| cmpq $16,%rbx |
| vmovdqa %xmm0,%xmm1 |
| jb .Lopen_avx2_exit |
| subq $16,%rbx |
| |
| vpxor (%rsi),%xmm0,%xmm1 |
| vmovdqu %xmm1,(%rdi) |
| leaq 16(%rsi),%rsi |
| leaq 16(%rdi),%rdi |
| vperm2i128 $0x11,%ymm0,%ymm0,%ymm0 |
| vmovdqa %xmm0,%xmm1 |
| .Lopen_avx2_exit: |
| vzeroupper |
| jmp .Lopen_sse_tail_16 |
| |
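| // Whole message is at most 192 bytes: generate keystream from two two-block-wide ChaCha20 states and fall through to the short open path. |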
| .Lopen_avx2_192: |
| vmovdqa %ymm0,%ymm1 |
| vmovdqa %ymm0,%ymm2 |
| vmovdqa %ymm4,%ymm5 |
| vmovdqa %ymm4,%ymm6 |
| vmovdqa %ymm8,%ymm9 |
| vmovdqa %ymm8,%ymm10 |
| vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 |
| vmovdqa %ymm12,%ymm11 |
| vmovdqa %ymm13,%ymm15 |
| movq $10,%r10 |
| .Lopen_avx2_192_rounds: |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol16(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpsrld $20,%ymm4,%ymm3 |
| vpslld $12,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol8(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpslld $7,%ymm4,%ymm3 |
| vpsrld $25,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpalignr $12,%ymm12,%ymm12,%ymm12 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $4,%ymm4,%ymm4,%ymm4 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol16(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpsrld $20,%ymm5,%ymm3 |
| vpslld $12,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol8(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpslld $7,%ymm5,%ymm3 |
| vpsrld $25,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpalignr $12,%ymm13,%ymm13,%ymm13 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $4,%ymm5,%ymm5,%ymm5 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol16(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpsrld $20,%ymm4,%ymm3 |
| vpslld $12,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol8(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpslld $7,%ymm4,%ymm3 |
| vpsrld $25,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpalignr $4,%ymm12,%ymm12,%ymm12 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $12,%ymm4,%ymm4,%ymm4 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol16(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpsrld $20,%ymm5,%ymm3 |
| vpslld $12,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol8(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpslld $7,%ymm5,%ymm3 |
| vpsrld $25,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpalignr $4,%ymm13,%ymm13,%ymm13 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $12,%ymm5,%ymm5,%ymm5 |
| |
| decq %r10 |
| jne .Lopen_avx2_192_rounds |
| vpaddd %ymm2,%ymm0,%ymm0 |
| vpaddd %ymm2,%ymm1,%ymm1 |
| vpaddd %ymm6,%ymm4,%ymm4 |
| vpaddd %ymm6,%ymm5,%ymm5 |
| vpaddd %ymm10,%ymm8,%ymm8 |
| vpaddd %ymm10,%ymm9,%ymm9 |
| vpaddd %ymm11,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm13,%ymm13 |
| vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 |
| |
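| // The first 32 bytes of keystream, clamped, become the Poly1305 key (r,s) stored at the base of the frame. |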
| vpand .Lclamp(%rip),%ymm3,%ymm3 |
| vmovdqa %ymm3,0+0(%rbp) |
| |
| vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 |
| vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 |
| vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 |
| vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 |
| vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 |
| vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 |
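| // Short open path: hash the AD (pointer in %rcx, length in %r8), then decrypt 32 bytes per pass, hashing each 32-byte ciphertext chunk first. |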
| .Lopen_avx2_short: |
| movq %r8,%r8 |
| call poly_hash_ad_internal |
| .Lopen_avx2_short_hash_and_xor_loop: |
| cmpq $32,%rbx |
| jb .Lopen_avx2_short_tail_32 |
| subq $32,%rbx |
| addq 0+0(%rsi),%r10 |
| adcq 8+0(%rsi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| addq 0+16(%rsi),%r10 |
| adcq 8+16(%rsi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| |
| vpxor (%rsi),%ymm0,%ymm0 |
| vmovdqu %ymm0,(%rdi) |
| leaq 32(%rsi),%rsi |
| leaq 32(%rdi),%rdi |
| |
| vmovdqa %ymm4,%ymm0 |
| vmovdqa %ymm8,%ymm4 |
| vmovdqa %ymm12,%ymm8 |
| vmovdqa %ymm1,%ymm12 |
| vmovdqa %ymm5,%ymm1 |
| vmovdqa %ymm9,%ymm5 |
| vmovdqa %ymm13,%ymm9 |
| vmovdqa %ymm2,%ymm13 |
| vmovdqa %ymm6,%ymm2 |
| jmp .Lopen_avx2_short_hash_and_xor_loop |
| .Lopen_avx2_short_tail_32: |
| cmpq $16,%rbx |
| vmovdqa %xmm0,%xmm1 |
| jb .Lopen_avx2_short_tail_32_exit |
| subq $16,%rbx |
| addq 0+0(%rsi),%r10 |
| adcq 8+0(%rsi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| vpxor (%rsi),%xmm0,%xmm3 |
| vmovdqu %xmm3,(%rdi) |
| leaq 16(%rsi),%rsi |
| leaq 16(%rdi),%rdi |
| vextracti128 $1,%ymm0,%xmm1 |
| .Lopen_avx2_short_tail_32_exit: |
| vzeroupper |
| jmp .Lopen_sse_tail_16 |
| |
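| // Whole message is at most 320 bytes: same idea as the 192-byte case, but with three two-block-wide states of keystream. |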
| .Lopen_avx2_320: |
| vmovdqa %ymm0,%ymm1 |
| vmovdqa %ymm0,%ymm2 |
| vmovdqa %ymm4,%ymm5 |
| vmovdqa %ymm4,%ymm6 |
| vmovdqa %ymm8,%ymm9 |
| vmovdqa %ymm8,%ymm10 |
| vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 |
| vpaddd .Lavx2_inc(%rip),%ymm13,%ymm14 |
| vmovdqa %ymm4,%ymm7 |
| vmovdqa %ymm8,%ymm11 |
| vmovdqa %ymm12,0+160(%rbp) |
| vmovdqa %ymm13,0+192(%rbp) |
| vmovdqa %ymm14,0+224(%rbp) |
| movq $10,%r10 |
| .Lopen_avx2_320_rounds: |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol16(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpsrld $20,%ymm4,%ymm3 |
| vpslld $12,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol8(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpslld $7,%ymm4,%ymm3 |
| vpsrld $25,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpalignr $12,%ymm12,%ymm12,%ymm12 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $4,%ymm4,%ymm4,%ymm4 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol16(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpsrld $20,%ymm5,%ymm3 |
| vpslld $12,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol8(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpslld $7,%ymm5,%ymm3 |
| vpsrld $25,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpalignr $12,%ymm13,%ymm13,%ymm13 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $4,%ymm5,%ymm5,%ymm5 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpshufb .Lrol16(%rip),%ymm14,%ymm14 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpsrld $20,%ymm6,%ymm3 |
| vpslld $12,%ymm6,%ymm6 |
| vpxor %ymm3,%ymm6,%ymm6 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpshufb .Lrol8(%rip),%ymm14,%ymm14 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpslld $7,%ymm6,%ymm3 |
| vpsrld $25,%ymm6,%ymm6 |
| vpxor %ymm3,%ymm6,%ymm6 |
| vpalignr $12,%ymm14,%ymm14,%ymm14 |
| vpalignr $8,%ymm10,%ymm10,%ymm10 |
| vpalignr $4,%ymm6,%ymm6,%ymm6 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol16(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpsrld $20,%ymm4,%ymm3 |
| vpslld $12,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol8(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpslld $7,%ymm4,%ymm3 |
| vpsrld $25,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpalignr $4,%ymm12,%ymm12,%ymm12 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $12,%ymm4,%ymm4,%ymm4 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol16(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpsrld $20,%ymm5,%ymm3 |
| vpslld $12,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol8(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpslld $7,%ymm5,%ymm3 |
| vpsrld $25,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpalignr $4,%ymm13,%ymm13,%ymm13 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $12,%ymm5,%ymm5,%ymm5 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpshufb .Lrol16(%rip),%ymm14,%ymm14 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpsrld $20,%ymm6,%ymm3 |
| vpslld $12,%ymm6,%ymm6 |
| vpxor %ymm3,%ymm6,%ymm6 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpshufb .Lrol8(%rip),%ymm14,%ymm14 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpslld $7,%ymm6,%ymm3 |
| vpsrld $25,%ymm6,%ymm6 |
| vpxor %ymm3,%ymm6,%ymm6 |
| vpalignr $4,%ymm14,%ymm14,%ymm14 |
| vpalignr $8,%ymm10,%ymm10,%ymm10 |
| vpalignr $12,%ymm6,%ymm6,%ymm6 |
| |
| decq %r10 |
| jne .Lopen_avx2_320_rounds |
| vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 |
| vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 |
| vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 |
| vpaddd %ymm7,%ymm4,%ymm4 |
| vpaddd %ymm7,%ymm5,%ymm5 |
| vpaddd %ymm7,%ymm6,%ymm6 |
| vpaddd %ymm11,%ymm8,%ymm8 |
| vpaddd %ymm11,%ymm9,%ymm9 |
| vpaddd %ymm11,%ymm10,%ymm10 |
| vpaddd 0+160(%rbp),%ymm12,%ymm12 |
| vpaddd 0+192(%rbp),%ymm13,%ymm13 |
| vpaddd 0+224(%rbp),%ymm14,%ymm14 |
| vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 |
| |
| vpand .Lclamp(%rip),%ymm3,%ymm3 |
| vmovdqa %ymm3,0+0(%rbp) |
| |
| vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 |
| vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 |
| vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 |
| vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 |
| vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 |
| vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 |
| vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 |
| vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 |
| vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 |
| vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 |
| jmp .Lopen_avx2_short |
| .size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2 |
| .cfi_endproc |
| |
| |
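| // AVX2 seal (encrypt): the layout mirrors the open path, with Poly1305 running over ciphertext the code has already written. |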
| .globl chacha20_poly1305_seal_avx2 |
| .hidden chacha20_poly1305_seal_avx2 |
| .type chacha20_poly1305_seal_avx2,@function |
| .align 64 |
| chacha20_poly1305_seal_avx2: |
| .cfi_startproc |
| _CET_ENDBR |
| pushq %rbp |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %rbp,-16 |
| pushq %rbx |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %rbx,-24 |
| pushq %r12 |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %r12,-32 |
| pushq %r13 |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %r13,-40 |
| pushq %r14 |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %r14,-48 |
| pushq %r15 |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %r15,-56 |
| |
| |
| pushq %r9 |
| .cfi_adjust_cfa_offset 8 |
| .cfi_offset %r9,-64 |
| subq $288 + 0 + 32,%rsp |
| .cfi_adjust_cfa_offset 288 + 32 |
| leaq 32(%rsp),%rbp |
| andq $-32,%rbp |
| |
| movq 56(%r9),%rbx |
| addq %rdx,%rbx |
| movq %r8,0+0+32(%rbp) |
| movq %rbx,8+0+32(%rbp) |
| movq %rdx,%rbx |
| |
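| // Load the key and counter/nonce block via %r9, build the initial ChaCha20 state, and branch to a short path for inputs of at most 320 bytes. |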
| vzeroupper |
| vmovdqa .Lchacha20_consts(%rip),%ymm0 |
| vbroadcasti128 0(%r9),%ymm4 |
| vbroadcasti128 16(%r9),%ymm8 |
| vbroadcasti128 32(%r9),%ymm12 |
| vpaddd .Lavx2_init(%rip),%ymm12,%ymm12 |
| cmpq $192,%rbx |
| jbe .Lseal_avx2_192 |
| cmpq $320,%rbx |
| jbe .Lseal_avx2_320 |
| vmovdqa %ymm0,%ymm1 |
| vmovdqa %ymm0,%ymm2 |
| vmovdqa %ymm0,%ymm3 |
| vmovdqa %ymm4,%ymm5 |
| vmovdqa %ymm4,%ymm6 |
| vmovdqa %ymm4,%ymm7 |
| vmovdqa %ymm4,0+64(%rbp) |
| vmovdqa %ymm8,%ymm9 |
| vmovdqa %ymm8,%ymm10 |
| vmovdqa %ymm8,%ymm11 |
| vmovdqa %ymm8,0+96(%rbp) |
| vmovdqa %ymm12,%ymm15 |
| vpaddd .Lavx2_inc(%rip),%ymm15,%ymm14 |
| vpaddd .Lavx2_inc(%rip),%ymm14,%ymm13 |
| vpaddd .Lavx2_inc(%rip),%ymm13,%ymm12 |
| vmovdqa %ymm12,0+160(%rbp) |
| vmovdqa %ymm13,0+192(%rbp) |
| vmovdqa %ymm14,0+224(%rbp) |
| vmovdqa %ymm15,0+256(%rbp) |
| movq $10,%r10 |
| .Lseal_avx2_init_rounds: |
| vmovdqa %ymm8,0+128(%rbp) |
| vmovdqa .Lrol16(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $20,%ymm7,%ymm8 |
| vpslld $32-20,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $20,%ymm6,%ymm8 |
| vpslld $32-20,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| vpsrld $20,%ymm5,%ymm8 |
| vpslld $32-20,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $20,%ymm4,%ymm8 |
| vpslld $32-20,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa .Lrol8(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $25,%ymm7,%ymm8 |
| vpslld $32-25,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $25,%ymm6,%ymm8 |
| vpslld $32-25,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| vpsrld $25,%ymm5,%ymm8 |
| vpslld $32-25,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $25,%ymm4,%ymm8 |
| vpslld $32-25,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa 0+128(%rbp),%ymm8 |
| vpalignr $4,%ymm7,%ymm7,%ymm7 |
| vpalignr $8,%ymm11,%ymm11,%ymm11 |
| vpalignr $12,%ymm15,%ymm15,%ymm15 |
| vpalignr $4,%ymm6,%ymm6,%ymm6 |
| vpalignr $8,%ymm10,%ymm10,%ymm10 |
| vpalignr $12,%ymm14,%ymm14,%ymm14 |
| vpalignr $4,%ymm5,%ymm5,%ymm5 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $12,%ymm13,%ymm13,%ymm13 |
| vpalignr $4,%ymm4,%ymm4,%ymm4 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $12,%ymm12,%ymm12,%ymm12 |
| vmovdqa %ymm8,0+128(%rbp) |
| vmovdqa .Lrol16(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $20,%ymm7,%ymm8 |
| vpslld $32-20,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $20,%ymm6,%ymm8 |
| vpslld $32-20,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| vpsrld $20,%ymm5,%ymm8 |
| vpslld $32-20,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $20,%ymm4,%ymm8 |
| vpslld $32-20,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa .Lrol8(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $25,%ymm7,%ymm8 |
| vpslld $32-25,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $25,%ymm6,%ymm8 |
| vpslld $32-25,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| vpsrld $25,%ymm5,%ymm8 |
| vpslld $32-25,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $25,%ymm4,%ymm8 |
| vpslld $32-25,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa 0+128(%rbp),%ymm8 |
| vpalignr $12,%ymm7,%ymm7,%ymm7 |
| vpalignr $8,%ymm11,%ymm11,%ymm11 |
| vpalignr $4,%ymm15,%ymm15,%ymm15 |
| vpalignr $12,%ymm6,%ymm6,%ymm6 |
| vpalignr $8,%ymm10,%ymm10,%ymm10 |
| vpalignr $4,%ymm14,%ymm14,%ymm14 |
| vpalignr $12,%ymm5,%ymm5,%ymm5 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $4,%ymm13,%ymm13,%ymm13 |
| vpalignr $12,%ymm4,%ymm4,%ymm4 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $4,%ymm12,%ymm12,%ymm12 |
| |
| decq %r10 |
| jnz .Lseal_avx2_init_rounds |
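| // First batch of keystream is ready: clamp the first 32 bytes into the Poly1305 key, hash the AD, then write the first 320 bytes of ciphertext. |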
| vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 |
| vpaddd 0+64(%rbp),%ymm7,%ymm7 |
| vpaddd 0+96(%rbp),%ymm11,%ymm11 |
| vpaddd 0+256(%rbp),%ymm15,%ymm15 |
| vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 |
| vpaddd 0+64(%rbp),%ymm6,%ymm6 |
| vpaddd 0+96(%rbp),%ymm10,%ymm10 |
| vpaddd 0+224(%rbp),%ymm14,%ymm14 |
| vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 |
| vpaddd 0+64(%rbp),%ymm5,%ymm5 |
| vpaddd 0+96(%rbp),%ymm9,%ymm9 |
| vpaddd 0+192(%rbp),%ymm13,%ymm13 |
| vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 |
| vpaddd 0+64(%rbp),%ymm4,%ymm4 |
| vpaddd 0+96(%rbp),%ymm8,%ymm8 |
| vpaddd 0+160(%rbp),%ymm12,%ymm12 |
| |
| vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 |
| vperm2i128 $0x02,%ymm3,%ymm7,%ymm15 |
| vperm2i128 $0x13,%ymm3,%ymm7,%ymm3 |
| vpand .Lclamp(%rip),%ymm15,%ymm15 |
| vmovdqa %ymm15,0+0(%rbp) |
| movq %r8,%r8 |
| call poly_hash_ad_internal |
| |
| vpxor 0(%rsi),%ymm3,%ymm3 |
| vpxor 32(%rsi),%ymm11,%ymm11 |
| vmovdqu %ymm3,0(%rdi) |
| vmovdqu %ymm11,32(%rdi) |
| vperm2i128 $0x02,%ymm2,%ymm6,%ymm15 |
| vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 |
| vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 |
| vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 |
| vpxor 0+64(%rsi),%ymm15,%ymm15 |
| vpxor 32+64(%rsi),%ymm2,%ymm2 |
| vpxor 64+64(%rsi),%ymm6,%ymm6 |
| vpxor 96+64(%rsi),%ymm10,%ymm10 |
| vmovdqu %ymm15,0+64(%rdi) |
| vmovdqu %ymm2,32+64(%rdi) |
| vmovdqu %ymm6,64+64(%rdi) |
| vmovdqu %ymm10,96+64(%rdi) |
| vperm2i128 $0x02,%ymm1,%ymm5,%ymm15 |
| vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 |
| vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 |
| vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 |
| vpxor 0+192(%rsi),%ymm15,%ymm15 |
| vpxor 32+192(%rsi),%ymm1,%ymm1 |
| vpxor 64+192(%rsi),%ymm5,%ymm5 |
| vpxor 96+192(%rsi),%ymm9,%ymm9 |
| vmovdqu %ymm15,0+192(%rdi) |
| vmovdqu %ymm1,32+192(%rdi) |
| vmovdqu %ymm5,64+192(%rdi) |
| vmovdqu %ymm9,96+192(%rdi) |
| vperm2i128 $0x13,%ymm0,%ymm4,%ymm15 |
| vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 |
| vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 |
| vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 |
| vmovdqa %ymm15,%ymm8 |
| |
| leaq 320(%rsi),%rsi |
| subq $320,%rbx |
| movq $320,%rcx |
| cmpq $128,%rbx |
| jbe .Lseal_avx2_short_hash_remainder |
| vpxor 0(%rsi),%ymm0,%ymm0 |
| vpxor 32(%rsi),%ymm4,%ymm4 |
| vpxor 64(%rsi),%ymm8,%ymm8 |
| vpxor 96(%rsi),%ymm12,%ymm12 |
| vmovdqu %ymm0,320(%rdi) |
| vmovdqu %ymm4,352(%rdi) |
| vmovdqu %ymm8,384(%rdi) |
| vmovdqu %ymm12,416(%rdi) |
| leaq 128(%rsi),%rsi |
| subq $128,%rbx |
| movq $8,%rcx |
| movq $2,%r8 |
| cmpq $128,%rbx |
| jbe .Lseal_avx2_tail_128 |
| cmpq $256,%rbx |
| jbe .Lseal_avx2_tail_256 |
| cmpq $384,%rbx |
| jbe .Lseal_avx2_tail_384 |
| cmpq $512,%rbx |
| jbe .Lseal_avx2_tail_512 |
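| // More than 512 bytes still to seal: refill four two-block-wide ChaCha20 states and fall into the main interleaved loop. |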
| vmovdqa .Lchacha20_consts(%rip),%ymm0 |
| vmovdqa 0+64(%rbp),%ymm4 |
| vmovdqa 0+96(%rbp),%ymm8 |
| vmovdqa %ymm0,%ymm1 |
| vmovdqa %ymm4,%ymm5 |
| vmovdqa %ymm8,%ymm9 |
| vmovdqa %ymm0,%ymm2 |
| vmovdqa %ymm4,%ymm6 |
| vmovdqa %ymm8,%ymm10 |
| vmovdqa %ymm0,%ymm3 |
| vmovdqa %ymm4,%ymm7 |
| vmovdqa %ymm8,%ymm11 |
| vmovdqa .Lavx2_inc(%rip),%ymm12 |
| vpaddd 0+160(%rbp),%ymm12,%ymm15 |
| vpaddd %ymm15,%ymm12,%ymm14 |
| vpaddd %ymm14,%ymm12,%ymm13 |
| vpaddd %ymm13,%ymm12,%ymm12 |
| vmovdqa %ymm15,0+256(%rbp) |
| vmovdqa %ymm14,0+224(%rbp) |
| vmovdqa %ymm13,0+192(%rbp) |
| vmovdqa %ymm12,0+160(%rbp) |
| vmovdqa %ymm8,0+128(%rbp) |
| vmovdqa .Lrol16(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $20,%ymm7,%ymm8 |
| vpslld $32-20,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $20,%ymm6,%ymm8 |
| vpslld $32-20,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| vpsrld $20,%ymm5,%ymm8 |
| vpslld $32-20,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $20,%ymm4,%ymm8 |
| vpslld $32-20,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa .Lrol8(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $25,%ymm7,%ymm8 |
| vpslld $32-25,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $25,%ymm6,%ymm8 |
| vpslld $32-25,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| vpsrld $25,%ymm5,%ymm8 |
| vpslld $32-25,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $25,%ymm4,%ymm8 |
| vpslld $32-25,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa 0+128(%rbp),%ymm8 |
| vpalignr $4,%ymm7,%ymm7,%ymm7 |
| vpalignr $8,%ymm11,%ymm11,%ymm11 |
| vpalignr $12,%ymm15,%ymm15,%ymm15 |
| vpalignr $4,%ymm6,%ymm6,%ymm6 |
| vpalignr $8,%ymm10,%ymm10,%ymm10 |
| vpalignr $12,%ymm14,%ymm14,%ymm14 |
| vpalignr $4,%ymm5,%ymm5,%ymm5 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $12,%ymm13,%ymm13,%ymm13 |
| vpalignr $4,%ymm4,%ymm4,%ymm4 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $12,%ymm12,%ymm12,%ymm12 |
| vmovdqa %ymm8,0+128(%rbp) |
| vmovdqa .Lrol16(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $20,%ymm7,%ymm8 |
| vpslld $32-20,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $20,%ymm6,%ymm8 |
| vpslld $32-20,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| vpsrld $20,%ymm5,%ymm8 |
| vpslld $32-20,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $20,%ymm4,%ymm8 |
| vpslld $32-20,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa .Lrol8(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $25,%ymm7,%ymm8 |
| vpslld $32-25,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $25,%ymm6,%ymm8 |
| vpslld $32-25,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| vpsrld $25,%ymm5,%ymm8 |
| vpslld $32-25,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $25,%ymm4,%ymm8 |
| vpslld $32-25,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa 0+128(%rbp),%ymm8 |
| vpalignr $12,%ymm7,%ymm7,%ymm7 |
| vpalignr $8,%ymm11,%ymm11,%ymm11 |
| vpalignr $4,%ymm15,%ymm15,%ymm15 |
| vpalignr $12,%ymm6,%ymm6,%ymm6 |
| vpalignr $8,%ymm10,%ymm10,%ymm10 |
| vpalignr $4,%ymm14,%ymm14,%ymm14 |
| vpalignr $12,%ymm5,%ymm5,%ymm5 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $4,%ymm13,%ymm13,%ymm13 |
| vpalignr $12,%ymm4,%ymm4,%ymm4 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $4,%ymm12,%ymm12,%ymm12 |
| vmovdqa %ymm8,0+128(%rbp) |
| vmovdqa .Lrol16(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $20,%ymm7,%ymm8 |
| vpslld $32-20,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $20,%ymm6,%ymm8 |
| vpslld $32-20,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| vpsrld $20,%ymm5,%ymm8 |
| vpslld $32-20,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $20,%ymm4,%ymm8 |
| vpslld $32-20,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa .Lrol8(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| |
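| // First iteration: enter the rounds loop part-way with %rcx = 9, since some rounds were already done above; %rdi is backed up 16 bytes so the interleaved Poly1305 loads start at the ciphertext written so far. |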
| subq $16,%rdi |
| movq $9,%rcx |
| jmp .Lseal_avx2_main_loop_rounds_entry |
| .align 32 |
| .Lseal_avx2_main_loop: |
| vmovdqa .Lchacha20_consts(%rip),%ymm0 |
| vmovdqa 0+64(%rbp),%ymm4 |
| vmovdqa 0+96(%rbp),%ymm8 |
| vmovdqa %ymm0,%ymm1 |
| vmovdqa %ymm4,%ymm5 |
| vmovdqa %ymm8,%ymm9 |
| vmovdqa %ymm0,%ymm2 |
| vmovdqa %ymm4,%ymm6 |
| vmovdqa %ymm8,%ymm10 |
| vmovdqa %ymm0,%ymm3 |
| vmovdqa %ymm4,%ymm7 |
| vmovdqa %ymm8,%ymm11 |
| vmovdqa .Lavx2_inc(%rip),%ymm12 |
| vpaddd 0+160(%rbp),%ymm12,%ymm15 |
| vpaddd %ymm15,%ymm12,%ymm14 |
| vpaddd %ymm14,%ymm12,%ymm13 |
| vpaddd %ymm13,%ymm12,%ymm12 |
| vmovdqa %ymm15,0+256(%rbp) |
| vmovdqa %ymm14,0+224(%rbp) |
| vmovdqa %ymm13,0+192(%rbp) |
| vmovdqa %ymm12,0+160(%rbp) |
| |
| movq $10,%rcx |
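| // Ten interleaved double rounds: each pass below also absorbs 48 bytes of previously written ciphertext (three Poly1305 blocks) and advances %rdi. |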
| .align 32 |
| .Lseal_avx2_main_loop_rounds: |
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| vmovdqa %ymm8,0+128(%rbp) |
| vmovdqa .Lrol16(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $20,%ymm7,%ymm8 |
| vpslld $32-20,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $20,%ymm6,%ymm8 |
| vpslld $32-20,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| vpsrld $20,%ymm5,%ymm8 |
| vpslld $32-20,%ymm5,%ymm5 |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $20,%ymm4,%ymm8 |
| vpslld $32-20,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa .Lrol8(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| .Lseal_avx2_main_loop_rounds_entry: |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| addq 0+16(%rdi),%r10 |
| adcq 8+16(%rdi),%r11 |
| adcq $1,%r12 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $25,%ymm7,%ymm8 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| vpslld $32-25,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $25,%ymm6,%ymm8 |
| vpslld $32-25,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| vpsrld $25,%ymm5,%ymm8 |
| vpslld $32-25,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $25,%ymm4,%ymm8 |
| vpslld $32-25,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa 0+128(%rbp),%ymm8 |
| vpalignr $4,%ymm7,%ymm7,%ymm7 |
| vpalignr $8,%ymm11,%ymm11,%ymm11 |
| vpalignr $12,%ymm15,%ymm15,%ymm15 |
| vpalignr $4,%ymm6,%ymm6,%ymm6 |
| vpalignr $8,%ymm10,%ymm10,%ymm10 |
| vpalignr $12,%ymm14,%ymm14,%ymm14 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| vpalignr $4,%ymm5,%ymm5,%ymm5 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $12,%ymm13,%ymm13,%ymm13 |
| vpalignr $4,%ymm4,%ymm4,%ymm4 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $12,%ymm12,%ymm12,%ymm12 |
| vmovdqa %ymm8,0+128(%rbp) |
| vmovdqa .Lrol16(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $20,%ymm7,%ymm8 |
| vpslld $32-20,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $20,%ymm6,%ymm8 |
| vpslld $32-20,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| addq 0+32(%rdi),%r10 |
| adcq 8+32(%rdi),%r11 |
| adcq $1,%r12 |
| |
| leaq 48(%rdi),%rdi |
| vpsrld $20,%ymm5,%ymm8 |
| vpslld $32-20,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $20,%ymm4,%ymm8 |
| vpslld $32-20,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa .Lrol8(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $25,%ymm7,%ymm8 |
| vpslld $32-25,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $25,%ymm6,%ymm8 |
| vpslld $32-25,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| vpsrld $25,%ymm5,%ymm8 |
| vpslld $32-25,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $25,%ymm4,%ymm8 |
| vpslld $32-25,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa 0+128(%rbp),%ymm8 |
| vpalignr $12,%ymm7,%ymm7,%ymm7 |
| vpalignr $8,%ymm11,%ymm11,%ymm11 |
| vpalignr $4,%ymm15,%ymm15,%ymm15 |
| vpalignr $12,%ymm6,%ymm6,%ymm6 |
| vpalignr $8,%ymm10,%ymm10,%ymm10 |
| vpalignr $4,%ymm14,%ymm14,%ymm14 |
| vpalignr $12,%ymm5,%ymm5,%ymm5 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $4,%ymm13,%ymm13,%ymm13 |
| vpalignr $12,%ymm4,%ymm4,%ymm4 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| vpalignr $4,%ymm12,%ymm12,%ymm12 |
| |
| decq %rcx |
| jne .Lseal_avx2_main_loop_rounds |
| vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 |
| vpaddd 0+64(%rbp),%ymm7,%ymm7 |
| vpaddd 0+96(%rbp),%ymm11,%ymm11 |
| vpaddd 0+256(%rbp),%ymm15,%ymm15 |
| vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 |
| vpaddd 0+64(%rbp),%ymm6,%ymm6 |
| vpaddd 0+96(%rbp),%ymm10,%ymm10 |
| vpaddd 0+224(%rbp),%ymm14,%ymm14 |
| vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 |
| vpaddd 0+64(%rbp),%ymm5,%ymm5 |
| vpaddd 0+96(%rbp),%ymm9,%ymm9 |
| vpaddd 0+192(%rbp),%ymm13,%ymm13 |
| vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 |
| vpaddd 0+64(%rbp),%ymm4,%ymm4 |
| vpaddd 0+96(%rbp),%ymm8,%ymm8 |
| vpaddd 0+160(%rbp),%ymm12,%ymm12 |
| |
| vmovdqa %ymm0,0+128(%rbp) |
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| addq 0+16(%rdi),%r10 |
| adcq 8+16(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 32(%rdi),%rdi |
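| // Re-interleave the eight finished keystream blocks with vperm2i128, XOR them |
| // with 512 bytes of plaintext at %rsi, and write the ciphertext to %rdi. |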
| vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 |
| vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 |
| vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 |
| vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 |
| vpxor 0+0(%rsi),%ymm0,%ymm0 |
| vpxor 32+0(%rsi),%ymm3,%ymm3 |
| vpxor 64+0(%rsi),%ymm7,%ymm7 |
| vpxor 96+0(%rsi),%ymm11,%ymm11 |
| vmovdqu %ymm0,0+0(%rdi) |
| vmovdqu %ymm3,32+0(%rdi) |
| vmovdqu %ymm7,64+0(%rdi) |
| vmovdqu %ymm11,96+0(%rdi) |
| |
| vmovdqa 0+128(%rbp),%ymm0 |
| vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 |
| vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 |
| vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 |
| vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 |
| vpxor 0+128(%rsi),%ymm3,%ymm3 |
| vpxor 32+128(%rsi),%ymm2,%ymm2 |
| vpxor 64+128(%rsi),%ymm6,%ymm6 |
| vpxor 96+128(%rsi),%ymm10,%ymm10 |
| vmovdqu %ymm3,0+128(%rdi) |
| vmovdqu %ymm2,32+128(%rdi) |
| vmovdqu %ymm6,64+128(%rdi) |
| vmovdqu %ymm10,96+128(%rdi) |
| vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 |
| vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 |
| vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 |
| vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 |
| vpxor 0+256(%rsi),%ymm3,%ymm3 |
| vpxor 32+256(%rsi),%ymm1,%ymm1 |
| vpxor 64+256(%rsi),%ymm5,%ymm5 |
| vpxor 96+256(%rsi),%ymm9,%ymm9 |
| vmovdqu %ymm3,0+256(%rdi) |
| vmovdqu %ymm1,32+256(%rdi) |
| vmovdqu %ymm5,64+256(%rdi) |
| vmovdqu %ymm9,96+256(%rdi) |
| vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 |
| vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 |
| vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 |
| vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 |
| vpxor 0+384(%rsi),%ymm3,%ymm3 |
| vpxor 32+384(%rsi),%ymm0,%ymm0 |
| vpxor 64+384(%rsi),%ymm4,%ymm4 |
| vpxor 96+384(%rsi),%ymm8,%ymm8 |
| vmovdqu %ymm3,0+384(%rdi) |
| vmovdqu %ymm0,32+384(%rdi) |
| vmovdqu %ymm4,64+384(%rdi) |
| vmovdqu %ymm8,96+384(%rdi) |
| |
| leaq 512(%rsi),%rsi |
| subq $512,%rbx |
| cmpq $512,%rbx |
| jg .Lseal_avx2_main_loop |
| |
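| // No more full 512-byte chunks: absorb the last two 16-byte ciphertext blocks |
| // of the chunk just written, then dispatch on the remaining length. |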
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| addq 0+16(%rdi),%r10 |
| adcq 8+16(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 32(%rdi),%rdi |
| movq $10,%rcx |
| xorq %r8,%r8 |
| |
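| // Tail dispatch: %rbx holds the plaintext bytes still to encrypt (at most 512). |
| // Each tail keeps absorbing the 480 not-yet-hashed ciphertext bytes of the last |
| // full chunk (three 16-byte blocks per double-round, hence %rcx = 10, %r8 = 0) |
| // while generating just enough fresh keystream. |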
| cmpq $384,%rbx |
| ja .Lseal_avx2_tail_512 |
| cmpq $256,%rbx |
| ja .Lseal_avx2_tail_384 |
| cmpq $128,%rbx |
| ja .Lseal_avx2_tail_256 |
| |
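| // At most 128 bytes left: a single 2-block state (ymm0/4/8/12), with the block |
| // counter advanced from the value saved at 160(%rbp). |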
| .Lseal_avx2_tail_128: |
| vmovdqa .Lchacha20_consts(%rip),%ymm0 |
| vmovdqa 0+64(%rbp),%ymm4 |
| vmovdqa 0+96(%rbp),%ymm8 |
| vmovdqa .Lavx2_inc(%rip),%ymm12 |
| vpaddd 0+160(%rbp),%ymm12,%ymm12 |
| vmovdqa %ymm12,0+160(%rbp) |
| |
| .Lseal_avx2_tail_128_rounds_and_3xhash: |
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 16(%rdi),%rdi |
| .Lseal_avx2_tail_128_rounds_and_2xhash: |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol16(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpsrld $20,%ymm4,%ymm3 |
| vpslld $12,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol8(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpslld $7,%ymm4,%ymm3 |
| vpsrld $25,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpalignr $12,%ymm12,%ymm12,%ymm12 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $4,%ymm4,%ymm4,%ymm4 |
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol16(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpsrld $20,%ymm4,%ymm3 |
| vpslld $12,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol8(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpslld $7,%ymm4,%ymm3 |
| vpsrld $25,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpalignr $4,%ymm12,%ymm12,%ymm12 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $12,%ymm4,%ymm4,%ymm4 |
| addq 0+16(%rdi),%r10 |
| adcq 8+16(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 32(%rdi),%rdi |
| decq %rcx |
| jg .Lseal_avx2_tail_128_rounds_and_3xhash |
| decq %r8 |
| jge .Lseal_avx2_tail_128_rounds_and_2xhash |
| vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 |
| vpaddd 0+64(%rbp),%ymm4,%ymm4 |
| vpaddd 0+96(%rbp),%ymm8,%ymm8 |
| vpaddd 0+160(%rbp),%ymm12,%ymm12 |
| vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 |
| vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 |
| vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 |
| vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 |
| vmovdqa %ymm3,%ymm8 |
| |
| jmp .Lseal_avx2_short_loop |
| |
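| // 129..256 bytes left: two 2-block states; both counters are derived from the |
| // saved counter and written back to 160/192(%rbp). |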
| .Lseal_avx2_tail_256: |
| vmovdqa .Lchacha20_consts(%rip),%ymm0 |
| vmovdqa 0+64(%rbp),%ymm4 |
| vmovdqa 0+96(%rbp),%ymm8 |
| vmovdqa %ymm0,%ymm1 |
| vmovdqa %ymm4,%ymm5 |
| vmovdqa %ymm8,%ymm9 |
| vmovdqa .Lavx2_inc(%rip),%ymm12 |
| vpaddd 0+160(%rbp),%ymm12,%ymm13 |
| vpaddd %ymm13,%ymm12,%ymm12 |
| vmovdqa %ymm12,0+160(%rbp) |
| vmovdqa %ymm13,0+192(%rbp) |
| |
| .Lseal_avx2_tail_256_rounds_and_3xhash: |
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 16(%rdi),%rdi |
| .Lseal_avx2_tail_256_rounds_and_2xhash: |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol16(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpsrld $20,%ymm4,%ymm3 |
| vpslld $12,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol8(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpslld $7,%ymm4,%ymm3 |
| vpsrld $25,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpalignr $12,%ymm12,%ymm12,%ymm12 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $4,%ymm4,%ymm4,%ymm4 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol16(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpsrld $20,%ymm5,%ymm3 |
| vpslld $12,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol8(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpslld $7,%ymm5,%ymm3 |
| vpsrld $25,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpalignr $12,%ymm13,%ymm13,%ymm13 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $4,%ymm5,%ymm5,%ymm5 |
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol16(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpsrld $20,%ymm4,%ymm3 |
| vpslld $12,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol8(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpslld $7,%ymm4,%ymm3 |
| vpsrld $25,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpalignr $4,%ymm12,%ymm12,%ymm12 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $12,%ymm4,%ymm4,%ymm4 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol16(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpsrld $20,%ymm5,%ymm3 |
| vpslld $12,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol8(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpslld $7,%ymm5,%ymm3 |
| vpsrld $25,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpalignr $4,%ymm13,%ymm13,%ymm13 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $12,%ymm5,%ymm5,%ymm5 |
| addq 0+16(%rdi),%r10 |
| adcq 8+16(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 32(%rdi),%rdi |
| decq %rcx |
| jg .Lseal_avx2_tail_256_rounds_and_3xhash |
| decq %r8 |
| jge .Lseal_avx2_tail_256_rounds_and_2xhash |
| vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 |
| vpaddd 0+64(%rbp),%ymm5,%ymm5 |
| vpaddd 0+96(%rbp),%ymm9,%ymm9 |
| vpaddd 0+192(%rbp),%ymm13,%ymm13 |
| vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 |
| vpaddd 0+64(%rbp),%ymm4,%ymm4 |
| vpaddd 0+96(%rbp),%ymm8,%ymm8 |
| vpaddd 0+160(%rbp),%ymm12,%ymm12 |
| vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 |
| vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 |
| vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 |
| vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 |
| vpxor 0+0(%rsi),%ymm3,%ymm3 |
| vpxor 32+0(%rsi),%ymm1,%ymm1 |
| vpxor 64+0(%rsi),%ymm5,%ymm5 |
| vpxor 96+0(%rsi),%ymm9,%ymm9 |
| vmovdqu %ymm3,0+0(%rdi) |
| vmovdqu %ymm1,32+0(%rdi) |
| vmovdqu %ymm5,64+0(%rdi) |
| vmovdqu %ymm9,96+0(%rdi) |
| vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 |
| vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 |
| vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 |
| vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 |
| vmovdqa %ymm3,%ymm8 |
| |
| movq $128,%rcx |
| leaq 128(%rsi),%rsi |
| subq $128,%rbx |
| jmp .Lseal_avx2_short_hash_remainder |
| |
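| // 257..384 bytes left: three 2-block states, handled like the 256-byte tail. |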
| .Lseal_avx2_tail_384: |
| vmovdqa .Lchacha20_consts(%rip),%ymm0 |
| vmovdqa 0+64(%rbp),%ymm4 |
| vmovdqa 0+96(%rbp),%ymm8 |
| vmovdqa %ymm0,%ymm1 |
| vmovdqa %ymm4,%ymm5 |
| vmovdqa %ymm8,%ymm9 |
| vmovdqa %ymm0,%ymm2 |
| vmovdqa %ymm4,%ymm6 |
| vmovdqa %ymm8,%ymm10 |
| vmovdqa .Lavx2_inc(%rip),%ymm12 |
| vpaddd 0+160(%rbp),%ymm12,%ymm14 |
| vpaddd %ymm14,%ymm12,%ymm13 |
| vpaddd %ymm13,%ymm12,%ymm12 |
| vmovdqa %ymm12,0+160(%rbp) |
| vmovdqa %ymm13,0+192(%rbp) |
| vmovdqa %ymm14,0+224(%rbp) |
| |
| .Lseal_avx2_tail_384_rounds_and_3xhash: |
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 16(%rdi),%rdi |
| .Lseal_avx2_tail_384_rounds_and_2xhash: |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol16(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpsrld $20,%ymm4,%ymm3 |
| vpslld $12,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol8(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpslld $7,%ymm4,%ymm3 |
| vpsrld $25,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpalignr $12,%ymm12,%ymm12,%ymm12 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $4,%ymm4,%ymm4,%ymm4 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol16(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpsrld $20,%ymm5,%ymm3 |
| vpslld $12,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol8(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpslld $7,%ymm5,%ymm3 |
| vpsrld $25,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpalignr $12,%ymm13,%ymm13,%ymm13 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $4,%ymm5,%ymm5,%ymm5 |
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpshufb .Lrol16(%rip),%ymm14,%ymm14 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpsrld $20,%ymm6,%ymm3 |
| vpslld $12,%ymm6,%ymm6 |
| vpxor %ymm3,%ymm6,%ymm6 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpshufb .Lrol8(%rip),%ymm14,%ymm14 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpslld $7,%ymm6,%ymm3 |
| vpsrld $25,%ymm6,%ymm6 |
| vpxor %ymm3,%ymm6,%ymm6 |
| vpalignr $12,%ymm14,%ymm14,%ymm14 |
| vpalignr $8,%ymm10,%ymm10,%ymm10 |
| vpalignr $4,%ymm6,%ymm6,%ymm6 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol16(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpsrld $20,%ymm4,%ymm3 |
| vpslld $12,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol8(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpslld $7,%ymm4,%ymm3 |
| vpsrld $25,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpalignr $4,%ymm12,%ymm12,%ymm12 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $12,%ymm4,%ymm4,%ymm4 |
| addq 0+16(%rdi),%r10 |
| adcq 8+16(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol16(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpsrld $20,%ymm5,%ymm3 |
| vpslld $12,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol8(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpslld $7,%ymm5,%ymm3 |
| vpsrld $25,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpalignr $4,%ymm13,%ymm13,%ymm13 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $12,%ymm5,%ymm5,%ymm5 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpshufb .Lrol16(%rip),%ymm14,%ymm14 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpsrld $20,%ymm6,%ymm3 |
| vpslld $12,%ymm6,%ymm6 |
| vpxor %ymm3,%ymm6,%ymm6 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpshufb .Lrol8(%rip),%ymm14,%ymm14 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpslld $7,%ymm6,%ymm3 |
| vpsrld $25,%ymm6,%ymm6 |
| vpxor %ymm3,%ymm6,%ymm6 |
| vpalignr $4,%ymm14,%ymm14,%ymm14 |
| vpalignr $8,%ymm10,%ymm10,%ymm10 |
| vpalignr $12,%ymm6,%ymm6,%ymm6 |
| |
| leaq 32(%rdi),%rdi |
| decq %rcx |
| jg .Lseal_avx2_tail_384_rounds_and_3xhash |
| decq %r8 |
| jge .Lseal_avx2_tail_384_rounds_and_2xhash |
| vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 |
| vpaddd 0+64(%rbp),%ymm6,%ymm6 |
| vpaddd 0+96(%rbp),%ymm10,%ymm10 |
| vpaddd 0+224(%rbp),%ymm14,%ymm14 |
| vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 |
| vpaddd 0+64(%rbp),%ymm5,%ymm5 |
| vpaddd 0+96(%rbp),%ymm9,%ymm9 |
| vpaddd 0+192(%rbp),%ymm13,%ymm13 |
| vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 |
| vpaddd 0+64(%rbp),%ymm4,%ymm4 |
| vpaddd 0+96(%rbp),%ymm8,%ymm8 |
| vpaddd 0+160(%rbp),%ymm12,%ymm12 |
| vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 |
| vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 |
| vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 |
| vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 |
| vpxor 0+0(%rsi),%ymm3,%ymm3 |
| vpxor 32+0(%rsi),%ymm2,%ymm2 |
| vpxor 64+0(%rsi),%ymm6,%ymm6 |
| vpxor 96+0(%rsi),%ymm10,%ymm10 |
| vmovdqu %ymm3,0+0(%rdi) |
| vmovdqu %ymm2,32+0(%rdi) |
| vmovdqu %ymm6,64+0(%rdi) |
| vmovdqu %ymm10,96+0(%rdi) |
| vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 |
| vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 |
| vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 |
| vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 |
| vpxor 0+128(%rsi),%ymm3,%ymm3 |
| vpxor 32+128(%rsi),%ymm1,%ymm1 |
| vpxor 64+128(%rsi),%ymm5,%ymm5 |
| vpxor 96+128(%rsi),%ymm9,%ymm9 |
| vmovdqu %ymm3,0+128(%rdi) |
| vmovdqu %ymm1,32+128(%rdi) |
| vmovdqu %ymm5,64+128(%rdi) |
| vmovdqu %ymm9,96+128(%rdi) |
| vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 |
| vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 |
| vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 |
| vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 |
| vmovdqa %ymm3,%ymm8 |
| |
| movq $256,%rcx |
| leaq 256(%rsi),%rsi |
| subq $256,%rbx |
| jmp .Lseal_avx2_short_hash_remainder |
| |
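| // 385..512 bytes left: all four 2-block states, reusing the fully interleaved |
| // round/hash schedule of the main loop. |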
| .Lseal_avx2_tail_512: |
| vmovdqa .Lchacha20_consts(%rip),%ymm0 |
| vmovdqa 0+64(%rbp),%ymm4 |
| vmovdqa 0+96(%rbp),%ymm8 |
| vmovdqa %ymm0,%ymm1 |
| vmovdqa %ymm4,%ymm5 |
| vmovdqa %ymm8,%ymm9 |
| vmovdqa %ymm0,%ymm2 |
| vmovdqa %ymm4,%ymm6 |
| vmovdqa %ymm8,%ymm10 |
| vmovdqa %ymm0,%ymm3 |
| vmovdqa %ymm4,%ymm7 |
| vmovdqa %ymm8,%ymm11 |
| vmovdqa .Lavx2_inc(%rip),%ymm12 |
| vpaddd 0+160(%rbp),%ymm12,%ymm15 |
| vpaddd %ymm15,%ymm12,%ymm14 |
| vpaddd %ymm14,%ymm12,%ymm13 |
| vpaddd %ymm13,%ymm12,%ymm12 |
| vmovdqa %ymm15,0+256(%rbp) |
| vmovdqa %ymm14,0+224(%rbp) |
| vmovdqa %ymm13,0+192(%rbp) |
| vmovdqa %ymm12,0+160(%rbp) |
| |
| .Lseal_avx2_tail_512_rounds_and_3xhash: |
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 16(%rdi),%rdi |
| .Lseal_avx2_tail_512_rounds_and_2xhash: |
| vmovdqa %ymm8,0+128(%rbp) |
| vmovdqa .Lrol16(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $20,%ymm7,%ymm8 |
| vpslld $32-20,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $20,%ymm6,%ymm8 |
| vpslld $32-20,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| vpsrld $20,%ymm5,%ymm8 |
| vpslld $32-20,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $20,%ymm4,%ymm8 |
| vpslld $32-20,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa .Lrol8(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $25,%ymm7,%ymm8 |
| vpslld $32-25,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| vpsrld $25,%ymm6,%ymm8 |
| vpslld $32-25,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| vpsrld $25,%ymm5,%ymm8 |
| vpslld $32-25,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $25,%ymm4,%ymm8 |
| vpslld $32-25,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa 0+128(%rbp),%ymm8 |
| vpalignr $4,%ymm7,%ymm7,%ymm7 |
| vpalignr $8,%ymm11,%ymm11,%ymm11 |
| vpalignr $12,%ymm15,%ymm15,%ymm15 |
| vpalignr $4,%ymm6,%ymm6,%ymm6 |
| vpalignr $8,%ymm10,%ymm10,%ymm10 |
| vpalignr $12,%ymm14,%ymm14,%ymm14 |
| vpalignr $4,%ymm5,%ymm5,%ymm5 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $12,%ymm13,%ymm13,%ymm13 |
| vpalignr $4,%ymm4,%ymm4,%ymm4 |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $12,%ymm12,%ymm12,%ymm12 |
| vmovdqa %ymm8,0+128(%rbp) |
| vmovdqa .Lrol16(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $20,%ymm7,%ymm8 |
| vpslld $32-20,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $20,%ymm6,%ymm8 |
| vpslld $32-20,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| vpsrld $20,%ymm5,%ymm8 |
| vpslld $32-20,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $20,%ymm4,%ymm8 |
| vpslld $32-20,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa .Lrol8(%rip),%ymm8 |
| vpaddd %ymm7,%ymm3,%ymm3 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| addq 0+16(%rdi),%r10 |
| adcq 8+16(%rdi),%r11 |
| adcq $1,%r12 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm3,%ymm15,%ymm15 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb %ymm8,%ymm15,%ymm15 |
| vpshufb %ymm8,%ymm14,%ymm14 |
| vpshufb %ymm8,%ymm13,%ymm13 |
| vpshufb %ymm8,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm11,%ymm11 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpaddd 0+128(%rbp),%ymm12,%ymm8 |
| vpxor %ymm11,%ymm7,%ymm7 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa %ymm8,0+128(%rbp) |
| vpsrld $25,%ymm7,%ymm8 |
| movq 0+0+0(%rbp),%rdx |
| movq %rdx,%r15 |
| mulxq %r10,%r13,%r14 |
| mulxq %r11,%rax,%rdx |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| vpslld $32-25,%ymm7,%ymm7 |
| vpxor %ymm8,%ymm7,%ymm7 |
| vpsrld $25,%ymm6,%ymm8 |
| vpslld $32-25,%ymm6,%ymm6 |
| vpxor %ymm8,%ymm6,%ymm6 |
| vpsrld $25,%ymm5,%ymm8 |
| vpslld $32-25,%ymm5,%ymm5 |
| vpxor %ymm8,%ymm5,%ymm5 |
| vpsrld $25,%ymm4,%ymm8 |
| vpslld $32-25,%ymm4,%ymm4 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vmovdqa 0+128(%rbp),%ymm8 |
| vpalignr $12,%ymm7,%ymm7,%ymm7 |
| vpalignr $8,%ymm11,%ymm11,%ymm11 |
| vpalignr $4,%ymm15,%ymm15,%ymm15 |
| vpalignr $12,%ymm6,%ymm6,%ymm6 |
| vpalignr $8,%ymm10,%ymm10,%ymm10 |
| vpalignr $4,%ymm14,%ymm14,%ymm14 |
| vpalignr $12,%ymm5,%ymm5,%ymm5 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| movq 8+0+0(%rbp),%rdx |
| mulxq %r10,%r10,%rax |
| addq %r10,%r14 |
| mulxq %r11,%r11,%r9 |
| adcq %r11,%r15 |
| adcq $0,%r9 |
| imulq %r12,%rdx |
| vpalignr $4,%ymm13,%ymm13,%ymm13 |
| vpalignr $12,%ymm4,%ymm4,%ymm4 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $4,%ymm12,%ymm12,%ymm12 |
| |
| addq %rax,%r15 |
| adcq %rdx,%r9 |
| |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 32(%rdi),%rdi |
| decq %rcx |
| jg .Lseal_avx2_tail_512_rounds_and_3xhash |
| decq %r8 |
| jge .Lseal_avx2_tail_512_rounds_and_2xhash |
| vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 |
| vpaddd 0+64(%rbp),%ymm7,%ymm7 |
| vpaddd 0+96(%rbp),%ymm11,%ymm11 |
| vpaddd 0+256(%rbp),%ymm15,%ymm15 |
| vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 |
| vpaddd 0+64(%rbp),%ymm6,%ymm6 |
| vpaddd 0+96(%rbp),%ymm10,%ymm10 |
| vpaddd 0+224(%rbp),%ymm14,%ymm14 |
| vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 |
| vpaddd 0+64(%rbp),%ymm5,%ymm5 |
| vpaddd 0+96(%rbp),%ymm9,%ymm9 |
| vpaddd 0+192(%rbp),%ymm13,%ymm13 |
| vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 |
| vpaddd 0+64(%rbp),%ymm4,%ymm4 |
| vpaddd 0+96(%rbp),%ymm8,%ymm8 |
| vpaddd 0+160(%rbp),%ymm12,%ymm12 |
| |
| vmovdqa %ymm0,0+128(%rbp) |
| vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 |
| vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 |
| vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 |
| vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 |
| vpxor 0+0(%rsi),%ymm0,%ymm0 |
| vpxor 32+0(%rsi),%ymm3,%ymm3 |
| vpxor 64+0(%rsi),%ymm7,%ymm7 |
| vpxor 96+0(%rsi),%ymm11,%ymm11 |
| vmovdqu %ymm0,0+0(%rdi) |
| vmovdqu %ymm3,32+0(%rdi) |
| vmovdqu %ymm7,64+0(%rdi) |
| vmovdqu %ymm11,96+0(%rdi) |
| |
| vmovdqa 0+128(%rbp),%ymm0 |
| vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 |
| vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 |
| vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 |
| vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 |
| vpxor 0+128(%rsi),%ymm3,%ymm3 |
| vpxor 32+128(%rsi),%ymm2,%ymm2 |
| vpxor 64+128(%rsi),%ymm6,%ymm6 |
| vpxor 96+128(%rsi),%ymm10,%ymm10 |
| vmovdqu %ymm3,0+128(%rdi) |
| vmovdqu %ymm2,32+128(%rdi) |
| vmovdqu %ymm6,64+128(%rdi) |
| vmovdqu %ymm10,96+128(%rdi) |
| vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 |
| vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 |
| vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 |
| vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 |
| vpxor 0+256(%rsi),%ymm3,%ymm3 |
| vpxor 32+256(%rsi),%ymm1,%ymm1 |
| vpxor 64+256(%rsi),%ymm5,%ymm5 |
| vpxor 96+256(%rsi),%ymm9,%ymm9 |
| vmovdqu %ymm3,0+256(%rdi) |
| vmovdqu %ymm1,32+256(%rdi) |
| vmovdqu %ymm5,64+256(%rdi) |
| vmovdqu %ymm9,96+256(%rdi) |
| vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 |
| vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 |
| vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 |
| vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 |
| vmovdqa %ymm3,%ymm8 |
| |
| movq $384,%rcx |
| leaq 384(%rsi),%rsi |
| subq $384,%rbx |
| jmp .Lseal_avx2_short_hash_remainder |
| |
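| // Short-input path (label suggests up to 320 bytes of plaintext): three 2-block |
| // ChaCha20 states run without hash interleaving; the first 32 bytes of |
| // keystream, clamped through .Lclamp, are stored at 0(%rbp) as the Poly1305 |
| // key (r,s). |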
| .Lseal_avx2_320: |
| vmovdqa %ymm0,%ymm1 |
| vmovdqa %ymm0,%ymm2 |
| vmovdqa %ymm4,%ymm5 |
| vmovdqa %ymm4,%ymm6 |
| vmovdqa %ymm8,%ymm9 |
| vmovdqa %ymm8,%ymm10 |
| vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 |
| vpaddd .Lavx2_inc(%rip),%ymm13,%ymm14 |
| vmovdqa %ymm4,%ymm7 |
| vmovdqa %ymm8,%ymm11 |
| vmovdqa %ymm12,0+160(%rbp) |
| vmovdqa %ymm13,0+192(%rbp) |
| vmovdqa %ymm14,0+224(%rbp) |
| movq $10,%r10 |
| .Lseal_avx2_320_rounds: |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol16(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpsrld $20,%ymm4,%ymm3 |
| vpslld $12,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol8(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpslld $7,%ymm4,%ymm3 |
| vpsrld $25,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpalignr $12,%ymm12,%ymm12,%ymm12 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $4,%ymm4,%ymm4,%ymm4 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol16(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpsrld $20,%ymm5,%ymm3 |
| vpslld $12,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol8(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpslld $7,%ymm5,%ymm3 |
| vpsrld $25,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpalignr $12,%ymm13,%ymm13,%ymm13 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $4,%ymm5,%ymm5,%ymm5 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpshufb .Lrol16(%rip),%ymm14,%ymm14 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpsrld $20,%ymm6,%ymm3 |
| vpslld $12,%ymm6,%ymm6 |
| vpxor %ymm3,%ymm6,%ymm6 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpshufb .Lrol8(%rip),%ymm14,%ymm14 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpslld $7,%ymm6,%ymm3 |
| vpsrld $25,%ymm6,%ymm6 |
| vpxor %ymm3,%ymm6,%ymm6 |
| vpalignr $12,%ymm14,%ymm14,%ymm14 |
| vpalignr $8,%ymm10,%ymm10,%ymm10 |
| vpalignr $4,%ymm6,%ymm6,%ymm6 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol16(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpsrld $20,%ymm4,%ymm3 |
| vpslld $12,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol8(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpslld $7,%ymm4,%ymm3 |
| vpsrld $25,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpalignr $4,%ymm12,%ymm12,%ymm12 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $12,%ymm4,%ymm4,%ymm4 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol16(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpsrld $20,%ymm5,%ymm3 |
| vpslld $12,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol8(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpslld $7,%ymm5,%ymm3 |
| vpsrld $25,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpalignr $4,%ymm13,%ymm13,%ymm13 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $12,%ymm5,%ymm5,%ymm5 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpshufb .Lrol16(%rip),%ymm14,%ymm14 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpsrld $20,%ymm6,%ymm3 |
| vpslld $12,%ymm6,%ymm6 |
| vpxor %ymm3,%ymm6,%ymm6 |
| vpaddd %ymm6,%ymm2,%ymm2 |
| vpxor %ymm2,%ymm14,%ymm14 |
| vpshufb .Lrol8(%rip),%ymm14,%ymm14 |
| vpaddd %ymm14,%ymm10,%ymm10 |
| vpxor %ymm10,%ymm6,%ymm6 |
| vpslld $7,%ymm6,%ymm3 |
| vpsrld $25,%ymm6,%ymm6 |
| vpxor %ymm3,%ymm6,%ymm6 |
| vpalignr $4,%ymm14,%ymm14,%ymm14 |
| vpalignr $8,%ymm10,%ymm10,%ymm10 |
| vpalignr $12,%ymm6,%ymm6,%ymm6 |
| |
| decq %r10 |
| jne .Lseal_avx2_320_rounds |
| vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 |
| vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 |
| vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 |
| vpaddd %ymm7,%ymm4,%ymm4 |
| vpaddd %ymm7,%ymm5,%ymm5 |
| vpaddd %ymm7,%ymm6,%ymm6 |
| vpaddd %ymm11,%ymm8,%ymm8 |
| vpaddd %ymm11,%ymm9,%ymm9 |
| vpaddd %ymm11,%ymm10,%ymm10 |
| vpaddd 0+160(%rbp),%ymm12,%ymm12 |
| vpaddd 0+192(%rbp),%ymm13,%ymm13 |
| vpaddd 0+224(%rbp),%ymm14,%ymm14 |
| vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 |
| |
| vpand .Lclamp(%rip),%ymm3,%ymm3 |
| vmovdqa %ymm3,0+0(%rbp) |
| |
| vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 |
| vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 |
| vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 |
| vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 |
| vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 |
| vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 |
| vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 |
| vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 |
| vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 |
| vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 |
| jmp .Lseal_avx2_short |
| |
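| // Shortest path (up to roughly 192 bytes of plaintext): two 2-block states; as |
| // above, the first 32 bytes of keystream become the clamped Poly1305 key. |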
| .Lseal_avx2_192: |
| vmovdqa %ymm0,%ymm1 |
| vmovdqa %ymm0,%ymm2 |
| vmovdqa %ymm4,%ymm5 |
| vmovdqa %ymm4,%ymm6 |
| vmovdqa %ymm8,%ymm9 |
| vmovdqa %ymm8,%ymm10 |
| vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 |
| vmovdqa %ymm12,%ymm11 |
| vmovdqa %ymm13,%ymm15 |
| movq $10,%r10 |
| .Lseal_avx2_192_rounds: |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol16(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpsrld $20,%ymm4,%ymm3 |
| vpslld $12,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol8(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpslld $7,%ymm4,%ymm3 |
| vpsrld $25,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpalignr $12,%ymm12,%ymm12,%ymm12 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $4,%ymm4,%ymm4,%ymm4 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol16(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpsrld $20,%ymm5,%ymm3 |
| vpslld $12,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol8(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpslld $7,%ymm5,%ymm3 |
| vpsrld $25,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpalignr $12,%ymm13,%ymm13,%ymm13 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $4,%ymm5,%ymm5,%ymm5 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol16(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpsrld $20,%ymm4,%ymm3 |
| vpslld $12,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpaddd %ymm4,%ymm0,%ymm0 |
| vpxor %ymm0,%ymm12,%ymm12 |
| vpshufb .Lrol8(%rip),%ymm12,%ymm12 |
| vpaddd %ymm12,%ymm8,%ymm8 |
| vpxor %ymm8,%ymm4,%ymm4 |
| vpslld $7,%ymm4,%ymm3 |
| vpsrld $25,%ymm4,%ymm4 |
| vpxor %ymm3,%ymm4,%ymm4 |
| vpalignr $4,%ymm12,%ymm12,%ymm12 |
| vpalignr $8,%ymm8,%ymm8,%ymm8 |
| vpalignr $12,%ymm4,%ymm4,%ymm4 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol16(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpsrld $20,%ymm5,%ymm3 |
| vpslld $12,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpaddd %ymm5,%ymm1,%ymm1 |
| vpxor %ymm1,%ymm13,%ymm13 |
| vpshufb .Lrol8(%rip),%ymm13,%ymm13 |
| vpaddd %ymm13,%ymm9,%ymm9 |
| vpxor %ymm9,%ymm5,%ymm5 |
| vpslld $7,%ymm5,%ymm3 |
| vpsrld $25,%ymm5,%ymm5 |
| vpxor %ymm3,%ymm5,%ymm5 |
| vpalignr $4,%ymm13,%ymm13,%ymm13 |
| vpalignr $8,%ymm9,%ymm9,%ymm9 |
| vpalignr $12,%ymm5,%ymm5,%ymm5 |
| |
| decq %r10 |
| jne .Lseal_avx2_192_rounds |
| vpaddd %ymm2,%ymm0,%ymm0 |
| vpaddd %ymm2,%ymm1,%ymm1 |
| vpaddd %ymm6,%ymm4,%ymm4 |
| vpaddd %ymm6,%ymm5,%ymm5 |
| vpaddd %ymm10,%ymm8,%ymm8 |
| vpaddd %ymm10,%ymm9,%ymm9 |
| vpaddd %ymm11,%ymm12,%ymm12 |
| vpaddd %ymm15,%ymm13,%ymm13 |
| vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 |
| |
| vpand .Lclamp(%rip),%ymm3,%ymm3 |
| vmovdqa %ymm3,0+0(%rbp) |
| |
| vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 |
| vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 |
| vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 |
| vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 |
| vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 |
| vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 |
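| // The 192/320 setup paths fall through here: hash the AD first (the movq |
| // %r8,%r8 below looks like a generator artifact; %r8 presumably already holds |
| // the AD length that poly_hash_ad_internal expects), then join the shared |
| // 32-bytes-at-a-time encrypt-and-hash loop, which the larger tails enter |
| // directly via .Lseal_avx2_short_hash_remainder / .Lseal_avx2_short_loop. |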
| .Lseal_avx2_short: |
| movq %r8,%r8 |
| call poly_hash_ad_internal |
| xorq %rcx,%rcx |
| .Lseal_avx2_short_hash_remainder: |
| cmpq $16,%rcx |
| jb .Lseal_avx2_short_loop |
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| subq $16,%rcx |
| addq $16,%rdi |
| jmp .Lseal_avx2_short_hash_remainder |
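| // 32 bytes at a time: XOR the plaintext with the keystream in ymm0, write the |
| // ciphertext, absorb the two resulting 16-byte blocks, then rotate the queued |
| // keystream registers down (ymm4 -> ymm0, ymm8 -> ymm4, ymm12 -> ymm8, ...). |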
| .Lseal_avx2_short_loop: |
| cmpq $32,%rbx |
| jb .Lseal_avx2_short_tail |
| subq $32,%rbx |
| |
| vpxor (%rsi),%ymm0,%ymm0 |
| vmovdqu %ymm0,(%rdi) |
| leaq 32(%rsi),%rsi |
| |
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| addq 0+16(%rdi),%r10 |
| adcq 8+16(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 32(%rdi),%rdi |
| |
| vmovdqa %ymm4,%ymm0 |
| vmovdqa %ymm8,%ymm4 |
| vmovdqa %ymm12,%ymm8 |
| vmovdqa %ymm1,%ymm12 |
| vmovdqa %ymm5,%ymm1 |
| vmovdqa %ymm9,%ymm5 |
| vmovdqa %ymm13,%ymm9 |
| vmovdqa %ymm2,%ymm13 |
| vmovdqa %ymm6,%ymm2 |
| jmp .Lseal_avx2_short_loop |
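| // 16..31 bytes left: emit one more 16-byte block from the low half of ymm0, |
| // absorb it, and shift the high half down for the final sub-16-byte tail. |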
| .Lseal_avx2_short_tail: |
| cmpq $16,%rbx |
| jb .Lseal_avx2_exit |
| subq $16,%rbx |
| vpxor (%rsi),%xmm0,%xmm3 |
| vmovdqu %xmm3,(%rdi) |
| leaq 16(%rsi),%rsi |
| addq 0+0(%rdi),%r10 |
| adcq 8+0(%rdi),%r11 |
| adcq $1,%r12 |
| movq 0+0+0(%rbp),%rax |
| movq %rax,%r15 |
| mulq %r10 |
| movq %rax,%r13 |
| movq %rdx,%r14 |
| movq 0+0+0(%rbp),%rax |
| mulq %r11 |
| imulq %r12,%r15 |
| addq %rax,%r14 |
| adcq %rdx,%r15 |
| movq 8+0+0(%rbp),%rax |
| movq %rax,%r9 |
| mulq %r10 |
| addq %rax,%r14 |
| adcq $0,%rdx |
| movq %rdx,%r10 |
| movq 8+0+0(%rbp),%rax |
| mulq %r11 |
| addq %rax,%r15 |
| adcq $0,%rdx |
| imulq %r12,%r9 |
| addq %r10,%r15 |
| adcq %rdx,%r9 |
| movq %r13,%r10 |
| movq %r14,%r11 |
| movq %r15,%r12 |
| andq $3,%r12 |
| movq %r15,%r13 |
| andq $-4,%r13 |
| movq %r9,%r14 |
| shrdq $2,%r9,%r15 |
| shrq $2,%r9 |
| addq %r13,%r15 |
| adcq %r14,%r9 |
| addq %r15,%r10 |
| adcq %r9,%r11 |
| adcq $0,%r12 |
| |
| leaq 16(%rdi),%rdi |
| vextracti128 $1,%ymm0,%xmm0 |
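| // Clear the AVX state; the SSE tail code finishes whatever (<16 bytes) remains. |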
| .Lseal_avx2_exit: |
| vzeroupper |
| jmp .Lseal_sse_tail_16 |
| .cfi_endproc |
| .size chacha20_poly1305_seal_avx2, .-chacha20_poly1305_seal_avx2 |
| #endif |