// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
.text

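// Constant pool shared by the code paths below: .Lone/.Linc/.Lfour and the
// wider .Lincy/.Leight variants are block-counter increments (the .Lzero and
// .L*z constants appear to be retained from the generator and are not
// referenced in this file); .Lrot16 and .Lrot24 are pshufb masks that rotate
// each 32-bit lane left by 16 and by 8 bits respectively (a left rotation by
// 8 equals a right rotation by 24, hence the name); .Lsigma is the ChaCha20
// constant "expand 32-byte k". The trailing .byte string is the CRYPTOGAMS
// credit string.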
.section .rodata
.align 64
.Lzero:
.long 0,0,0,0
.Lone:
.long 1,0,0,0
.Linc:
.long 0,1,2,3
.Lfour:
.long 4,4,4,4
.Lincy:
.long 0,2,4,6,1,3,5,7
.Leight:
.long 8,8,8,8,8,8,8,8
.Lrot16:
.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Lsigma:
.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.align 64
.Lzeroz:
.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.text
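// ChaCha20_ctr32_nohw: scalar fallback. Judging from the register usage
// below, the SysV AMD64 arguments are %rdi = out, %rsi = in, %rdx = length
// in bytes, %rcx = 32-byte key, %r8 = 16-byte counter/nonce block.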
.globl ChaCha20_ctr32_nohw
.hidden ChaCha20_ctr32_nohw
.type ChaCha20_ctr32_nohw,@function
.align 64
ChaCha20_ctr32_nohw:
.cfi_startproc
_CET_ENDBR
 pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset rbx,-16
 pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset rbp,-24
 pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset r12,-32
 pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset r13,-40
 pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset r14,-48
 pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset r15,-56
 subq $64+24,%rsp
.cfi_adjust_cfa_offset 88
.Lctr32_body:


 movdqu (%rcx),%xmm1
 movdqu 16(%rcx),%xmm2
 movdqu (%r8),%xmm3
 movdqa .Lone(%rip),%xmm4


 movdqa %xmm1,16(%rsp)
 movdqa %xmm2,32(%rsp)
 movdqa %xmm3,48(%rsp)
 movq %rdx,%rbp
 jmp .Loop_outer

.align 32
.Loop_outer:
 movl $0x61707865,%eax
 movl $0x3320646e,%ebx
 movl $0x79622d32,%ecx
 movl $0x6b206574,%edx
 movl 16(%rsp),%r8d
 movl 20(%rsp),%r9d
 movl 24(%rsp),%r10d
 movl 28(%rsp),%r11d
 movd %xmm3,%r12d
 movl 52(%rsp),%r13d
 movl 56(%rsp),%r14d
 movl 60(%rsp),%r15d

 movq %rbp,64+0(%rsp)
 movl $10,%ebp
 movq %rsi,64+8(%rsp)
.byte 102,72,15,126,214
 movq %rdi,64+16(%rsp)
 movq %rsi,%rdi
 shrq $32,%rdi
 jmp .Loop

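// Ten iterations of the ChaCha20 double round (20 rounds total): four
// column quarter-rounds followed by four diagonal quarter-rounds, each an
// add/xor/rotate sequence with rotations of 16, 12, 8, and 7 bits. Only 14
// general-purpose registers are available for the 16 state words, so two
// words at a time live in the spill slots at 32(%rsp)..44(%rsp). (The
// .byte sequence above encodes movq %xmm2,%rsi, loading state words 8-9.)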
.align 32
.Loop:
 addl %r8d,%eax
 xorl %eax,%r12d
 roll $16,%r12d
 addl %r9d,%ebx
 xorl %ebx,%r13d
 roll $16,%r13d
 addl %r12d,%esi
 xorl %esi,%r8d
 roll $12,%r8d
 addl %r13d,%edi
 xorl %edi,%r9d
 roll $12,%r9d
 addl %r8d,%eax
 xorl %eax,%r12d
 roll $8,%r12d
 addl %r9d,%ebx
 xorl %ebx,%r13d
 roll $8,%r13d
 addl %r12d,%esi
 xorl %esi,%r8d
 roll $7,%r8d
 addl %r13d,%edi
 xorl %edi,%r9d
 roll $7,%r9d
 movl %esi,32(%rsp)
 movl %edi,36(%rsp)
 movl 40(%rsp),%esi
 movl 44(%rsp),%edi
 addl %r10d,%ecx
 xorl %ecx,%r14d
 roll $16,%r14d
 addl %r11d,%edx
 xorl %edx,%r15d
 roll $16,%r15d
 addl %r14d,%esi
 xorl %esi,%r10d
 roll $12,%r10d
 addl %r15d,%edi
 xorl %edi,%r11d
 roll $12,%r11d
 addl %r10d,%ecx
 xorl %ecx,%r14d
 roll $8,%r14d
 addl %r11d,%edx
 xorl %edx,%r15d
 roll $8,%r15d
 addl %r14d,%esi
 xorl %esi,%r10d
 roll $7,%r10d
 addl %r15d,%edi
 xorl %edi,%r11d
 roll $7,%r11d
 addl %r9d,%eax
 xorl %eax,%r15d
 roll $16,%r15d
 addl %r10d,%ebx
 xorl %ebx,%r12d
 roll $16,%r12d
 addl %r15d,%esi
 xorl %esi,%r9d
 roll $12,%r9d
 addl %r12d,%edi
 xorl %edi,%r10d
 roll $12,%r10d
 addl %r9d,%eax
 xorl %eax,%r15d
 roll $8,%r15d
 addl %r10d,%ebx
 xorl %ebx,%r12d
 roll $8,%r12d
 addl %r15d,%esi
 xorl %esi,%r9d
 roll $7,%r9d
 addl %r12d,%edi
 xorl %edi,%r10d
 roll $7,%r10d
 movl %esi,40(%rsp)
 movl %edi,44(%rsp)
 movl 32(%rsp),%esi
 movl 36(%rsp),%edi
 addl %r11d,%ecx
 xorl %ecx,%r13d
 roll $16,%r13d
 addl %r8d,%edx
 xorl %edx,%r14d
 roll $16,%r14d
 addl %r13d,%esi
 xorl %esi,%r11d
 roll $12,%r11d
 addl %r14d,%edi
 xorl %edi,%r8d
 roll $12,%r8d
 addl %r11d,%ecx
 xorl %ecx,%r13d
 roll $8,%r13d
 addl %r8d,%edx
 xorl %edx,%r14d
 roll $8,%r14d
 addl %r13d,%esi
 xorl %esi,%r11d
 roll $7,%r11d
 addl %r14d,%edi
 xorl %edi,%r8d
 roll $7,%r8d
 decl %ebp
 jnz .Loop
 movl %edi,36(%rsp)
 movl %esi,32(%rsp)
 movq 64(%rsp),%rbp
 movdqa %xmm2,%xmm1
 movq 64+8(%rsp),%rsi
 paddd %xmm4,%xmm3
 movq 64+16(%rsp),%rdi

 addl $0x61707865,%eax
 addl $0x3320646e,%ebx
 addl $0x79622d32,%ecx
 addl $0x6b206574,%edx
 addl 16(%rsp),%r8d
 addl 20(%rsp),%r9d
 addl 24(%rsp),%r10d
 addl 28(%rsp),%r11d
 addl 48(%rsp),%r12d
 addl 52(%rsp),%r13d
 addl 56(%rsp),%r14d
 addl 60(%rsp),%r15d
 paddd 32(%rsp),%xmm1

 cmpq $64,%rbp
 jb .Ltail

 xorl 0(%rsi),%eax
 xorl 4(%rsi),%ebx
 xorl 8(%rsi),%ecx
 xorl 12(%rsi),%edx
 xorl 16(%rsi),%r8d
 xorl 20(%rsi),%r9d
 xorl 24(%rsi),%r10d
 xorl 28(%rsi),%r11d
 movdqu 32(%rsi),%xmm0
 xorl 48(%rsi),%r12d
 xorl 52(%rsi),%r13d
 xorl 56(%rsi),%r14d
 xorl 60(%rsi),%r15d
 leaq 64(%rsi),%rsi
 pxor %xmm1,%xmm0

 movdqa %xmm2,32(%rsp)
 movd %xmm3,48(%rsp)

 movl %eax,0(%rdi)
 movl %ebx,4(%rdi)
 movl %ecx,8(%rdi)
 movl %edx,12(%rdi)
 movl %r8d,16(%rdi)
 movl %r9d,20(%rdi)
 movl %r10d,24(%rdi)
 movl %r11d,28(%rdi)
 movdqu %xmm0,32(%rdi)
 movl %r12d,48(%rdi)
 movl %r13d,52(%rdi)
 movl %r14d,56(%rdi)
 movl %r15d,60(%rdi)
 leaq 64(%rdi),%rdi

 subq $64,%rbp
 jnz .Loop_outer

 jmp .Ldone

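// Partial final block: the keystream words are staged on the stack and
// xored into the input one byte at a time in .Loop_tail.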
.align 16
.Ltail:
 movl %eax,0(%rsp)
 movl %ebx,4(%rsp)
 xorq %rbx,%rbx
 movl %ecx,8(%rsp)
 movl %edx,12(%rsp)
 movl %r8d,16(%rsp)
 movl %r9d,20(%rsp)
 movl %r10d,24(%rsp)
 movl %r11d,28(%rsp)
 movdqa %xmm1,32(%rsp)
 movl %r12d,48(%rsp)
 movl %r13d,52(%rsp)
 movl %r14d,56(%rsp)
 movl %r15d,60(%rsp)

.Loop_tail:
 movzbl (%rsi,%rbx,1),%eax
 movzbl (%rsp,%rbx,1),%edx
 leaq 1(%rbx),%rbx
 xorl %edx,%eax
 movb %al,-1(%rdi,%rbx,1)
 decq %rbp
 jnz .Loop_tail

.Ldone:
 leaq 64+24+48(%rsp),%rsi
 movq -48(%rsi),%r15
.cfi_restore r15
 movq -40(%rsi),%r14
.cfi_restore r14
 movq -32(%rsi),%r13
.cfi_restore r13
 movq -24(%rsi),%r12
.cfi_restore r12
 movq -16(%rsi),%rbp
.cfi_restore rbp
 movq -8(%rsi),%rbx
.cfi_restore rbx
 leaq (%rsi),%rsp
.cfi_adjust_cfa_offset -136
.Lno_data:
 ret
.cfi_endproc
.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw
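// ChaCha20_ctr32_ssse3: one 64-byte block per iteration with the state in
// %xmm0-%xmm3, one matrix row per register. The .byte sequences in the
// round loop encode pshufb (with the .Lrot16/.Lrot24 masks) for the 16-
// and 8-bit rotations; the 12- and 7-bit rotations use shift/shift/or, and
// pshufd re-rotates the rows between column and diagonal rounds.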
.globl ChaCha20_ctr32_ssse3
.hidden ChaCha20_ctr32_ssse3
.type ChaCha20_ctr32_ssse3,@function
.align 32
ChaCha20_ctr32_ssse3:
.cfi_startproc
_CET_ENDBR
 movq %rsp,%r9
.cfi_def_cfa_register r9
 subq $64+8,%rsp
 movdqa .Lsigma(%rip),%xmm0
 movdqu (%rcx),%xmm1
 movdqu 16(%rcx),%xmm2
 movdqu (%r8),%xmm3
 movdqa .Lrot16(%rip),%xmm6
 movdqa .Lrot24(%rip),%xmm7

 movdqa %xmm0,0(%rsp)
 movdqa %xmm1,16(%rsp)
 movdqa %xmm2,32(%rsp)
 movdqa %xmm3,48(%rsp)
 movq $10,%r8
 jmp .Loop_ssse3

.align 32
.Loop_outer_ssse3:
 movdqa .Lone(%rip),%xmm3
 movdqa 0(%rsp),%xmm0
 movdqa 16(%rsp),%xmm1
 movdqa 32(%rsp),%xmm2
 paddd 48(%rsp),%xmm3
 movq $10,%r8
 movdqa %xmm3,48(%rsp)
 jmp .Loop_ssse3

.align 32
.Loop_ssse3:
 paddd %xmm1,%xmm0
 pxor %xmm0,%xmm3
.byte 102,15,56,0,222
 paddd %xmm3,%xmm2
 pxor %xmm2,%xmm1
 movdqa %xmm1,%xmm4
 psrld $20,%xmm1
 pslld $12,%xmm4
 por %xmm4,%xmm1
 paddd %xmm1,%xmm0
 pxor %xmm0,%xmm3
.byte 102,15,56,0,223
 paddd %xmm3,%xmm2
 pxor %xmm2,%xmm1
 movdqa %xmm1,%xmm4
 psrld $25,%xmm1
 pslld $7,%xmm4
 por %xmm4,%xmm1
 pshufd $78,%xmm2,%xmm2
 pshufd $57,%xmm1,%xmm1
 pshufd $147,%xmm3,%xmm3
 nop
 paddd %xmm1,%xmm0
 pxor %xmm0,%xmm3
.byte 102,15,56,0,222
 paddd %xmm3,%xmm2
 pxor %xmm2,%xmm1
 movdqa %xmm1,%xmm4
 psrld $20,%xmm1
 pslld $12,%xmm4
 por %xmm4,%xmm1
 paddd %xmm1,%xmm0
 pxor %xmm0,%xmm3
.byte 102,15,56,0,223
 paddd %xmm3,%xmm2
 pxor %xmm2,%xmm1
 movdqa %xmm1,%xmm4
 psrld $25,%xmm1
 pslld $7,%xmm4
 por %xmm4,%xmm1
 pshufd $78,%xmm2,%xmm2
 pshufd $147,%xmm1,%xmm1
 pshufd $57,%xmm3,%xmm3
 decq %r8
 jnz .Loop_ssse3
 paddd 0(%rsp),%xmm0
 paddd 16(%rsp),%xmm1
 paddd 32(%rsp),%xmm2
 paddd 48(%rsp),%xmm3

 cmpq $64,%rdx
 jb .Ltail_ssse3

 movdqu 0(%rsi),%xmm4
 movdqu 16(%rsi),%xmm5
 pxor %xmm4,%xmm0
 movdqu 32(%rsi),%xmm4
 pxor %xmm5,%xmm1
 movdqu 48(%rsi),%xmm5
 leaq 64(%rsi),%rsi
 pxor %xmm4,%xmm2
 pxor %xmm5,%xmm3

 movdqu %xmm0,0(%rdi)
 movdqu %xmm1,16(%rdi)
 movdqu %xmm2,32(%rdi)
 movdqu %xmm3,48(%rdi)
 leaq 64(%rdi),%rdi

 subq $64,%rdx
 jnz .Loop_outer_ssse3

 jmp .Ldone_ssse3

.align 16
.Ltail_ssse3:
 movdqa %xmm0,0(%rsp)
 movdqa %xmm1,16(%rsp)
 movdqa %xmm2,32(%rsp)
 movdqa %xmm3,48(%rsp)
 xorq %r8,%r8

.Loop_tail_ssse3:
 movzbl (%rsi,%r8,1),%eax
 movzbl (%rsp,%r8,1),%ecx
 leaq 1(%r8),%r8
 xorl %ecx,%eax
 movb %al,-1(%rdi,%r8,1)
 decq %rdx
 jnz .Loop_tail_ssse3

.Ldone_ssse3:
 leaq (%r9),%rsp
.cfi_def_cfa_register rsp
.Lssse3_epilogue:
 ret
.cfi_endproc
.size ChaCha20_ctr32_ssse3,.-ChaCha20_ctr32_ssse3
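// ChaCha20_ctr32_ssse3_4x: four blocks per iteration. The state is kept
// transposed, each %xmm register holding one of the 16 state words across
// all four blocks (with copies staged on the stack), so every vector
// instruction advances four blocks at once. Lengths below 256 bytes fall
// through to .Ltail4x.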
.globl ChaCha20_ctr32_ssse3_4x
.hidden ChaCha20_ctr32_ssse3_4x
.type ChaCha20_ctr32_ssse3_4x,@function
.align 32
ChaCha20_ctr32_ssse3_4x:
.cfi_startproc
_CET_ENDBR
 movq %rsp,%r9
.cfi_def_cfa_register r9
 subq $0x140+8,%rsp
 movdqa .Lsigma(%rip),%xmm11
 movdqu (%rcx),%xmm15
 movdqu 16(%rcx),%xmm7
 movdqu (%r8),%xmm3
 leaq 256(%rsp),%rcx
 leaq .Lrot16(%rip),%r10
 leaq .Lrot24(%rip),%r11

 pshufd $0x00,%xmm11,%xmm8
 pshufd $0x55,%xmm11,%xmm9
 movdqa %xmm8,64(%rsp)
 pshufd $0xaa,%xmm11,%xmm10
 movdqa %xmm9,80(%rsp)
 pshufd $0xff,%xmm11,%xmm11
 movdqa %xmm10,96(%rsp)
 movdqa %xmm11,112(%rsp)

 pshufd $0x00,%xmm15,%xmm12
 pshufd $0x55,%xmm15,%xmm13
 movdqa %xmm12,128-256(%rcx)
 pshufd $0xaa,%xmm15,%xmm14
 movdqa %xmm13,144-256(%rcx)
 pshufd $0xff,%xmm15,%xmm15
 movdqa %xmm14,160-256(%rcx)
 movdqa %xmm15,176-256(%rcx)

 pshufd $0x00,%xmm7,%xmm4
 pshufd $0x55,%xmm7,%xmm5
 movdqa %xmm4,192-256(%rcx)
 pshufd $0xaa,%xmm7,%xmm6
 movdqa %xmm5,208-256(%rcx)
 pshufd $0xff,%xmm7,%xmm7
 movdqa %xmm6,224-256(%rcx)
 movdqa %xmm7,240-256(%rcx)

 pshufd $0x00,%xmm3,%xmm0
 pshufd $0x55,%xmm3,%xmm1
 paddd .Linc(%rip),%xmm0
 pshufd $0xaa,%xmm3,%xmm2
 movdqa %xmm1,272-256(%rcx)
 pshufd $0xff,%xmm3,%xmm3
 movdqa %xmm2,288-256(%rcx)
 movdqa %xmm3,304-256(%rcx)

 jmp .Loop_enter4x

.align 32
.Loop_outer4x:
 movdqa 64(%rsp),%xmm8
 movdqa 80(%rsp),%xmm9
 movdqa 96(%rsp),%xmm10
 movdqa 112(%rsp),%xmm11
 movdqa 128-256(%rcx),%xmm12
 movdqa 144-256(%rcx),%xmm13
 movdqa 160-256(%rcx),%xmm14
 movdqa 176-256(%rcx),%xmm15
 movdqa 192-256(%rcx),%xmm4
 movdqa 208-256(%rcx),%xmm5
 movdqa 224-256(%rcx),%xmm6
 movdqa 240-256(%rcx),%xmm7
 movdqa 256-256(%rcx),%xmm0
 movdqa 272-256(%rcx),%xmm1
 movdqa 288-256(%rcx),%xmm2
 movdqa 304-256(%rcx),%xmm3
 paddd .Lfour(%rip),%xmm0

.Loop_enter4x:
 movdqa %xmm6,32(%rsp)
 movdqa %xmm7,48(%rsp)
 movdqa (%r10),%xmm7
 movl $10,%eax
 movdqa %xmm0,256-256(%rcx)
 jmp .Loop4x

.align 32
.Loop4x:
 paddd %xmm12,%xmm8
 paddd %xmm13,%xmm9
 pxor %xmm8,%xmm0
 pxor %xmm9,%xmm1
.byte 102,15,56,0,199
.byte 102,15,56,0,207
 paddd %xmm0,%xmm4
 paddd %xmm1,%xmm5
 pxor %xmm4,%xmm12
 pxor %xmm5,%xmm13
 movdqa %xmm12,%xmm6
 pslld $12,%xmm12
 psrld $20,%xmm6
 movdqa %xmm13,%xmm7
 pslld $12,%xmm13
 por %xmm6,%xmm12
 psrld $20,%xmm7
 movdqa (%r11),%xmm6
 por %xmm7,%xmm13
 paddd %xmm12,%xmm8
 paddd %xmm13,%xmm9
 pxor %xmm8,%xmm0
 pxor %xmm9,%xmm1
.byte 102,15,56,0,198
.byte 102,15,56,0,206
 paddd %xmm0,%xmm4
 paddd %xmm1,%xmm5
 pxor %xmm4,%xmm12
 pxor %xmm5,%xmm13
 movdqa %xmm12,%xmm7
 pslld $7,%xmm12
 psrld $25,%xmm7
 movdqa %xmm13,%xmm6
 pslld $7,%xmm13
 por %xmm7,%xmm12
 psrld $25,%xmm6
 movdqa (%r10),%xmm7
 por %xmm6,%xmm13
 movdqa %xmm4,0(%rsp)
 movdqa %xmm5,16(%rsp)
 movdqa 32(%rsp),%xmm4
 movdqa 48(%rsp),%xmm5
 paddd %xmm14,%xmm10
 paddd %xmm15,%xmm11
 pxor %xmm10,%xmm2
 pxor %xmm11,%xmm3
.byte 102,15,56,0,215
.byte 102,15,56,0,223
 paddd %xmm2,%xmm4
 paddd %xmm3,%xmm5
 pxor %xmm4,%xmm14
 pxor %xmm5,%xmm15
 movdqa %xmm14,%xmm6
 pslld $12,%xmm14
 psrld $20,%xmm6
 movdqa %xmm15,%xmm7
 pslld $12,%xmm15
 por %xmm6,%xmm14
 psrld $20,%xmm7
 movdqa (%r11),%xmm6
 por %xmm7,%xmm15
 paddd %xmm14,%xmm10
 paddd %xmm15,%xmm11
 pxor %xmm10,%xmm2
 pxor %xmm11,%xmm3
.byte 102,15,56,0,214
.byte 102,15,56,0,222
 paddd %xmm2,%xmm4
 paddd %xmm3,%xmm5
 pxor %xmm4,%xmm14
 pxor %xmm5,%xmm15
 movdqa %xmm14,%xmm7
 pslld $7,%xmm14
 psrld $25,%xmm7
 movdqa %xmm15,%xmm6
 pslld $7,%xmm15
 por %xmm7,%xmm14
 psrld $25,%xmm6
 movdqa (%r10),%xmm7
 por %xmm6,%xmm15
 paddd %xmm13,%xmm8
 paddd %xmm14,%xmm9
 pxor %xmm8,%xmm3
 pxor %xmm9,%xmm0
.byte 102,15,56,0,223
.byte 102,15,56,0,199
 paddd %xmm3,%xmm4
 paddd %xmm0,%xmm5
 pxor %xmm4,%xmm13
 pxor %xmm5,%xmm14
 movdqa %xmm13,%xmm6
 pslld $12,%xmm13
 psrld $20,%xmm6
 movdqa %xmm14,%xmm7
 pslld $12,%xmm14
 por %xmm6,%xmm13
 psrld $20,%xmm7
 movdqa (%r11),%xmm6
 por %xmm7,%xmm14
 paddd %xmm13,%xmm8
 paddd %xmm14,%xmm9
 pxor %xmm8,%xmm3
 pxor %xmm9,%xmm0
.byte 102,15,56,0,222
.byte 102,15,56,0,198
 paddd %xmm3,%xmm4
 paddd %xmm0,%xmm5
 pxor %xmm4,%xmm13
 pxor %xmm5,%xmm14
 movdqa %xmm13,%xmm7
 pslld $7,%xmm13
 psrld $25,%xmm7
 movdqa %xmm14,%xmm6
 pslld $7,%xmm14
 por %xmm7,%xmm13
 psrld $25,%xmm6
 movdqa (%r10),%xmm7
 por %xmm6,%xmm14
 movdqa %xmm4,32(%rsp)
 movdqa %xmm5,48(%rsp)
 movdqa 0(%rsp),%xmm4
 movdqa 16(%rsp),%xmm5
 paddd %xmm15,%xmm10
 paddd %xmm12,%xmm11
 pxor %xmm10,%xmm1
 pxor %xmm11,%xmm2
.byte 102,15,56,0,207
.byte 102,15,56,0,215
 paddd %xmm1,%xmm4
 paddd %xmm2,%xmm5
 pxor %xmm4,%xmm15
 pxor %xmm5,%xmm12
 movdqa %xmm15,%xmm6
 pslld $12,%xmm15
 psrld $20,%xmm6
 movdqa %xmm12,%xmm7
 pslld $12,%xmm12
 por %xmm6,%xmm15
 psrld $20,%xmm7
 movdqa (%r11),%xmm6
 por %xmm7,%xmm12
 paddd %xmm15,%xmm10
 paddd %xmm12,%xmm11
 pxor %xmm10,%xmm1
 pxor %xmm11,%xmm2
.byte 102,15,56,0,206
.byte 102,15,56,0,214
 paddd %xmm1,%xmm4
 paddd %xmm2,%xmm5
 pxor %xmm4,%xmm15
 pxor %xmm5,%xmm12
 movdqa %xmm15,%xmm7
 pslld $7,%xmm15
 psrld $25,%xmm7
 movdqa %xmm12,%xmm6
 pslld $7,%xmm12
 por %xmm7,%xmm15
 psrld $25,%xmm6
 movdqa (%r10),%xmm7
 por %xmm6,%xmm12
 decl %eax
 jnz .Loop4x

 paddd 64(%rsp),%xmm8
 paddd 80(%rsp),%xmm9
 paddd 96(%rsp),%xmm10
 paddd 112(%rsp),%xmm11

 movdqa %xmm8,%xmm6
 punpckldq %xmm9,%xmm8
 movdqa %xmm10,%xmm7
 punpckldq %xmm11,%xmm10
 punpckhdq %xmm9,%xmm6
 punpckhdq %xmm11,%xmm7
 movdqa %xmm8,%xmm9
 punpcklqdq %xmm10,%xmm8
 movdqa %xmm6,%xmm11
 punpcklqdq %xmm7,%xmm6
 punpckhqdq %xmm10,%xmm9
 punpckhqdq %xmm7,%xmm11
 paddd 128-256(%rcx),%xmm12
 paddd 144-256(%rcx),%xmm13
 paddd 160-256(%rcx),%xmm14
 paddd 176-256(%rcx),%xmm15

 movdqa %xmm8,0(%rsp)
 movdqa %xmm9,16(%rsp)
 movdqa 32(%rsp),%xmm8
 movdqa 48(%rsp),%xmm9

 movdqa %xmm12,%xmm10
 punpckldq %xmm13,%xmm12
 movdqa %xmm14,%xmm7
 punpckldq %xmm15,%xmm14
 punpckhdq %xmm13,%xmm10
 punpckhdq %xmm15,%xmm7
 movdqa %xmm12,%xmm13
 punpcklqdq %xmm14,%xmm12
 movdqa %xmm10,%xmm15
 punpcklqdq %xmm7,%xmm10
 punpckhqdq %xmm14,%xmm13
 punpckhqdq %xmm7,%xmm15
 paddd 192-256(%rcx),%xmm4
 paddd 208-256(%rcx),%xmm5
 paddd 224-256(%rcx),%xmm8
 paddd 240-256(%rcx),%xmm9

 movdqa %xmm6,32(%rsp)
 movdqa %xmm11,48(%rsp)

 movdqa %xmm4,%xmm14
 punpckldq %xmm5,%xmm4
 movdqa %xmm8,%xmm7
 punpckldq %xmm9,%xmm8
 punpckhdq %xmm5,%xmm14
 punpckhdq %xmm9,%xmm7
 movdqa %xmm4,%xmm5
 punpcklqdq %xmm8,%xmm4
 movdqa %xmm14,%xmm9
 punpcklqdq %xmm7,%xmm14
 punpckhqdq %xmm8,%xmm5
 punpckhqdq %xmm7,%xmm9
 paddd 256-256(%rcx),%xmm0
 paddd 272-256(%rcx),%xmm1
 paddd 288-256(%rcx),%xmm2
 paddd 304-256(%rcx),%xmm3

 movdqa %xmm0,%xmm8
 punpckldq %xmm1,%xmm0
 movdqa %xmm2,%xmm7
 punpckldq %xmm3,%xmm2
 punpckhdq %xmm1,%xmm8
 punpckhdq %xmm3,%xmm7
 movdqa %xmm0,%xmm1
 punpcklqdq %xmm2,%xmm0
 movdqa %xmm8,%xmm3
 punpcklqdq %xmm7,%xmm8
 punpckhqdq %xmm2,%xmm1
 punpckhqdq %xmm7,%xmm3
 cmpq $256,%rdx
 jb .Ltail4x

 movdqu 0(%rsi),%xmm6
 movdqu 16(%rsi),%xmm11
 movdqu 32(%rsi),%xmm2
 movdqu 48(%rsi),%xmm7
 pxor 0(%rsp),%xmm6
 pxor %xmm12,%xmm11
 pxor %xmm4,%xmm2
 pxor %xmm0,%xmm7

 movdqu %xmm6,0(%rdi)
 movdqu 64(%rsi),%xmm6
 movdqu %xmm11,16(%rdi)
 movdqu 80(%rsi),%xmm11
 movdqu %xmm2,32(%rdi)
 movdqu 96(%rsi),%xmm2
 movdqu %xmm7,48(%rdi)
 movdqu 112(%rsi),%xmm7
 leaq 128(%rsi),%rsi
 pxor 16(%rsp),%xmm6
 pxor %xmm13,%xmm11
 pxor %xmm5,%xmm2
 pxor %xmm1,%xmm7

 movdqu %xmm6,64(%rdi)
 movdqu 0(%rsi),%xmm6
 movdqu %xmm11,80(%rdi)
 movdqu 16(%rsi),%xmm11
 movdqu %xmm2,96(%rdi)
 movdqu 32(%rsi),%xmm2
 movdqu %xmm7,112(%rdi)
 leaq 128(%rdi),%rdi
 movdqu 48(%rsi),%xmm7
 pxor 32(%rsp),%xmm6
 pxor %xmm10,%xmm11
 pxor %xmm14,%xmm2
 pxor %xmm8,%xmm7

 movdqu %xmm6,0(%rdi)
 movdqu 64(%rsi),%xmm6
 movdqu %xmm11,16(%rdi)
 movdqu 80(%rsi),%xmm11
 movdqu %xmm2,32(%rdi)
 movdqu 96(%rsi),%xmm2
 movdqu %xmm7,48(%rdi)
 movdqu 112(%rsi),%xmm7
 leaq 128(%rsi),%rsi
 pxor 48(%rsp),%xmm6
 pxor %xmm15,%xmm11
 pxor %xmm9,%xmm2
 pxor %xmm3,%xmm7
 movdqu %xmm6,64(%rdi)
 movdqu %xmm11,80(%rdi)
 movdqu %xmm2,96(%rdi)
 movdqu %xmm7,112(%rdi)
 leaq 128(%rdi),%rdi

 subq $256,%rdx
 jnz .Loop_outer4x

 jmp .Ldone4x

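// 4x tail: write out any remaining whole 64-byte blocks, then byte-xor the
// final partial block against the stacked keystream in .Loop_tail4x.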
.Ltail4x:
 cmpq $192,%rdx
 jae .L192_or_more4x
 cmpq $128,%rdx
 jae .L128_or_more4x
 cmpq $64,%rdx
 jae .L64_or_more4x


 xorq %r10,%r10

 movdqa %xmm12,16(%rsp)
 movdqa %xmm4,32(%rsp)
 movdqa %xmm0,48(%rsp)
 jmp .Loop_tail4x

.align 32
.L64_or_more4x:
 movdqu 0(%rsi),%xmm6
 movdqu 16(%rsi),%xmm11
 movdqu 32(%rsi),%xmm2
 movdqu 48(%rsi),%xmm7
 pxor 0(%rsp),%xmm6
 pxor %xmm12,%xmm11
 pxor %xmm4,%xmm2
 pxor %xmm0,%xmm7
 movdqu %xmm6,0(%rdi)
 movdqu %xmm11,16(%rdi)
 movdqu %xmm2,32(%rdi)
 movdqu %xmm7,48(%rdi)
 je .Ldone4x

 movdqa 16(%rsp),%xmm6
 leaq 64(%rsi),%rsi
 xorq %r10,%r10
 movdqa %xmm6,0(%rsp)
 movdqa %xmm13,16(%rsp)
 leaq 64(%rdi),%rdi
 movdqa %xmm5,32(%rsp)
 subq $64,%rdx
 movdqa %xmm1,48(%rsp)
 jmp .Loop_tail4x

.align 32
.L128_or_more4x:
 movdqu 0(%rsi),%xmm6
 movdqu 16(%rsi),%xmm11
 movdqu 32(%rsi),%xmm2
 movdqu 48(%rsi),%xmm7
 pxor 0(%rsp),%xmm6
 pxor %xmm12,%xmm11
 pxor %xmm4,%xmm2
 pxor %xmm0,%xmm7

 movdqu %xmm6,0(%rdi)
 movdqu 64(%rsi),%xmm6
 movdqu %xmm11,16(%rdi)
 movdqu 80(%rsi),%xmm11
 movdqu %xmm2,32(%rdi)
 movdqu 96(%rsi),%xmm2
 movdqu %xmm7,48(%rdi)
 movdqu 112(%rsi),%xmm7
 pxor 16(%rsp),%xmm6
 pxor %xmm13,%xmm11
 pxor %xmm5,%xmm2
 pxor %xmm1,%xmm7
 movdqu %xmm6,64(%rdi)
 movdqu %xmm11,80(%rdi)
 movdqu %xmm2,96(%rdi)
 movdqu %xmm7,112(%rdi)
 je .Ldone4x

 movdqa 32(%rsp),%xmm6
 leaq 128(%rsi),%rsi
 xorq %r10,%r10
 movdqa %xmm6,0(%rsp)
 movdqa %xmm10,16(%rsp)
 leaq 128(%rdi),%rdi
 movdqa %xmm14,32(%rsp)
 subq $128,%rdx
 movdqa %xmm8,48(%rsp)
 jmp .Loop_tail4x

.align 32
.L192_or_more4x:
 movdqu 0(%rsi),%xmm6
 movdqu 16(%rsi),%xmm11
 movdqu 32(%rsi),%xmm2
 movdqu 48(%rsi),%xmm7
 pxor 0(%rsp),%xmm6
 pxor %xmm12,%xmm11
 pxor %xmm4,%xmm2
 pxor %xmm0,%xmm7

 movdqu %xmm6,0(%rdi)
 movdqu 64(%rsi),%xmm6
 movdqu %xmm11,16(%rdi)
 movdqu 80(%rsi),%xmm11
 movdqu %xmm2,32(%rdi)
 movdqu 96(%rsi),%xmm2
 movdqu %xmm7,48(%rdi)
 movdqu 112(%rsi),%xmm7
 leaq 128(%rsi),%rsi
 pxor 16(%rsp),%xmm6
 pxor %xmm13,%xmm11
 pxor %xmm5,%xmm2
 pxor %xmm1,%xmm7

 movdqu %xmm6,64(%rdi)
 movdqu 0(%rsi),%xmm6
 movdqu %xmm11,80(%rdi)
 movdqu 16(%rsi),%xmm11
 movdqu %xmm2,96(%rdi)
 movdqu 32(%rsi),%xmm2
 movdqu %xmm7,112(%rdi)
 leaq 128(%rdi),%rdi
 movdqu 48(%rsi),%xmm7
 pxor 32(%rsp),%xmm6
 pxor %xmm10,%xmm11
 pxor %xmm14,%xmm2
 pxor %xmm8,%xmm7
 movdqu %xmm6,0(%rdi)
 movdqu %xmm11,16(%rdi)
 movdqu %xmm2,32(%rdi)
 movdqu %xmm7,48(%rdi)
 je .Ldone4x

 movdqa 48(%rsp),%xmm6
 leaq 64(%rsi),%rsi
 xorq %r10,%r10
 movdqa %xmm6,0(%rsp)
 movdqa %xmm15,16(%rsp)
 leaq 64(%rdi),%rdi
 movdqa %xmm9,32(%rsp)
 subq $192,%rdx
 movdqa %xmm3,48(%rsp)

.Loop_tail4x:
 movzbl (%rsi,%r10,1),%eax
 movzbl (%rsp,%r10,1),%ecx
 leaq 1(%r10),%r10
 xorl %ecx,%eax
 movb %al,-1(%rdi,%r10,1)
 decq %rdx
 jnz .Loop_tail4x

.Ldone4x:
 leaq (%r9),%rsp
.cfi_def_cfa_register rsp
.L4x_epilogue:
 ret
.cfi_endproc
.size ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x
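// ChaCha20_ctr32_avx2: eight blocks per iteration using 256-bit registers,
// with the same transposed-state layout as the 4x path (one state word per
// %ymm register across eight blocks). vperm2i128 reassembles the 128-bit
// block halves after the dword/qword transposition.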
.globl ChaCha20_ctr32_avx2
.hidden ChaCha20_ctr32_avx2
.type ChaCha20_ctr32_avx2,@function
.align 32
ChaCha20_ctr32_avx2:
.cfi_startproc
_CET_ENDBR
 movq %rsp,%r9
.cfi_def_cfa_register r9
 subq $0x280+8,%rsp
 andq $-32,%rsp
 vzeroupper

 vbroadcasti128 .Lsigma(%rip),%ymm11
 vbroadcasti128 (%rcx),%ymm3
 vbroadcasti128 16(%rcx),%ymm15
 vbroadcasti128 (%r8),%ymm7
 leaq 256(%rsp),%rcx
 leaq 512(%rsp),%rax
 leaq .Lrot16(%rip),%r10
 leaq .Lrot24(%rip),%r11

 vpshufd $0x00,%ymm11,%ymm8
 vpshufd $0x55,%ymm11,%ymm9
 vmovdqa %ymm8,128-256(%rcx)
 vpshufd $0xaa,%ymm11,%ymm10
 vmovdqa %ymm9,160-256(%rcx)
 vpshufd $0xff,%ymm11,%ymm11
 vmovdqa %ymm10,192-256(%rcx)
 vmovdqa %ymm11,224-256(%rcx)

 vpshufd $0x00,%ymm3,%ymm0
 vpshufd $0x55,%ymm3,%ymm1
 vmovdqa %ymm0,256-256(%rcx)
 vpshufd $0xaa,%ymm3,%ymm2
 vmovdqa %ymm1,288-256(%rcx)
 vpshufd $0xff,%ymm3,%ymm3
 vmovdqa %ymm2,320-256(%rcx)
 vmovdqa %ymm3,352-256(%rcx)

 vpshufd $0x00,%ymm15,%ymm12
 vpshufd $0x55,%ymm15,%ymm13
 vmovdqa %ymm12,384-512(%rax)
 vpshufd $0xaa,%ymm15,%ymm14
 vmovdqa %ymm13,416-512(%rax)
 vpshufd $0xff,%ymm15,%ymm15
 vmovdqa %ymm14,448-512(%rax)
 vmovdqa %ymm15,480-512(%rax)

 vpshufd $0x00,%ymm7,%ymm4
 vpshufd $0x55,%ymm7,%ymm5
 vpaddd .Lincy(%rip),%ymm4,%ymm4
 vpshufd $0xaa,%ymm7,%ymm6
 vmovdqa %ymm5,544-512(%rax)
 vpshufd $0xff,%ymm7,%ymm7
 vmovdqa %ymm6,576-512(%rax)
 vmovdqa %ymm7,608-512(%rax)

 jmp .Loop_enter8x

.align 32
.Loop_outer8x:
 vmovdqa 128-256(%rcx),%ymm8
 vmovdqa 160-256(%rcx),%ymm9
 vmovdqa 192-256(%rcx),%ymm10
 vmovdqa 224-256(%rcx),%ymm11
 vmovdqa 256-256(%rcx),%ymm0
 vmovdqa 288-256(%rcx),%ymm1
 vmovdqa 320-256(%rcx),%ymm2
 vmovdqa 352-256(%rcx),%ymm3
 vmovdqa 384-512(%rax),%ymm12
 vmovdqa 416-512(%rax),%ymm13
 vmovdqa 448-512(%rax),%ymm14
 vmovdqa 480-512(%rax),%ymm15
 vmovdqa 512-512(%rax),%ymm4
 vmovdqa 544-512(%rax),%ymm5
 vmovdqa 576-512(%rax),%ymm6
 vmovdqa 608-512(%rax),%ymm7
 vpaddd .Leight(%rip),%ymm4,%ymm4

.Loop_enter8x:
 vmovdqa %ymm14,64(%rsp)
 vmovdqa %ymm15,96(%rsp)
 vbroadcasti128 (%r10),%ymm15
 vmovdqa %ymm4,512-512(%rax)
 movl $10,%eax
 jmp .Loop8x

.align 32
.Loop8x:
 vpaddd %ymm0,%ymm8,%ymm8
 vpxor %ymm4,%ymm8,%ymm4
 vpshufb %ymm15,%ymm4,%ymm4
 vpaddd %ymm1,%ymm9,%ymm9
 vpxor %ymm5,%ymm9,%ymm5
 vpshufb %ymm15,%ymm5,%ymm5
 vpaddd %ymm4,%ymm12,%ymm12
 vpxor %ymm0,%ymm12,%ymm0
 vpslld $12,%ymm0,%ymm14
 vpsrld $20,%ymm0,%ymm0
 vpor %ymm0,%ymm14,%ymm0
 vbroadcasti128 (%r11),%ymm14
 vpaddd %ymm5,%ymm13,%ymm13
 vpxor %ymm1,%ymm13,%ymm1
 vpslld $12,%ymm1,%ymm15
 vpsrld $20,%ymm1,%ymm1
 vpor %ymm1,%ymm15,%ymm1
 vpaddd %ymm0,%ymm8,%ymm8
 vpxor %ymm4,%ymm8,%ymm4
 vpshufb %ymm14,%ymm4,%ymm4
 vpaddd %ymm1,%ymm9,%ymm9
 vpxor %ymm5,%ymm9,%ymm5
 vpshufb %ymm14,%ymm5,%ymm5
 vpaddd %ymm4,%ymm12,%ymm12
 vpxor %ymm0,%ymm12,%ymm0
 vpslld $7,%ymm0,%ymm15
 vpsrld $25,%ymm0,%ymm0
 vpor %ymm0,%ymm15,%ymm0
 vbroadcasti128 (%r10),%ymm15
 vpaddd %ymm5,%ymm13,%ymm13
 vpxor %ymm1,%ymm13,%ymm1
 vpslld $7,%ymm1,%ymm14
 vpsrld $25,%ymm1,%ymm1
 vpor %ymm1,%ymm14,%ymm1
 vmovdqa %ymm12,0(%rsp)
 vmovdqa %ymm13,32(%rsp)
 vmovdqa 64(%rsp),%ymm12
 vmovdqa 96(%rsp),%ymm13
 vpaddd %ymm2,%ymm10,%ymm10
 vpxor %ymm6,%ymm10,%ymm6
 vpshufb %ymm15,%ymm6,%ymm6
 vpaddd %ymm3,%ymm11,%ymm11
 vpxor %ymm7,%ymm11,%ymm7
 vpshufb %ymm15,%ymm7,%ymm7
 vpaddd %ymm6,%ymm12,%ymm12
 vpxor %ymm2,%ymm12,%ymm2
 vpslld $12,%ymm2,%ymm14
 vpsrld $20,%ymm2,%ymm2
 vpor %ymm2,%ymm14,%ymm2
 vbroadcasti128 (%r11),%ymm14
 vpaddd %ymm7,%ymm13,%ymm13
 vpxor %ymm3,%ymm13,%ymm3
 vpslld $12,%ymm3,%ymm15
 vpsrld $20,%ymm3,%ymm3
 vpor %ymm3,%ymm15,%ymm3
 vpaddd %ymm2,%ymm10,%ymm10
 vpxor %ymm6,%ymm10,%ymm6
 vpshufb %ymm14,%ymm6,%ymm6
 vpaddd %ymm3,%ymm11,%ymm11
 vpxor %ymm7,%ymm11,%ymm7
 vpshufb %ymm14,%ymm7,%ymm7
 vpaddd %ymm6,%ymm12,%ymm12
 vpxor %ymm2,%ymm12,%ymm2
 vpslld $7,%ymm2,%ymm15
 vpsrld $25,%ymm2,%ymm2
 vpor %ymm2,%ymm15,%ymm2
 vbroadcasti128 (%r10),%ymm15
 vpaddd %ymm7,%ymm13,%ymm13
 vpxor %ymm3,%ymm13,%ymm3
 vpslld $7,%ymm3,%ymm14
 vpsrld $25,%ymm3,%ymm3
 vpor %ymm3,%ymm14,%ymm3
 vpaddd %ymm1,%ymm8,%ymm8
 vpxor %ymm7,%ymm8,%ymm7
 vpshufb %ymm15,%ymm7,%ymm7
 vpaddd %ymm2,%ymm9,%ymm9
 vpxor %ymm4,%ymm9,%ymm4
 vpshufb %ymm15,%ymm4,%ymm4
 vpaddd %ymm7,%ymm12,%ymm12
 vpxor %ymm1,%ymm12,%ymm1
 vpslld $12,%ymm1,%ymm14
 vpsrld $20,%ymm1,%ymm1
 vpor %ymm1,%ymm14,%ymm1
 vbroadcasti128 (%r11),%ymm14
 vpaddd %ymm4,%ymm13,%ymm13
 vpxor %ymm2,%ymm13,%ymm2
 vpslld $12,%ymm2,%ymm15
 vpsrld $20,%ymm2,%ymm2
 vpor %ymm2,%ymm15,%ymm2
 vpaddd %ymm1,%ymm8,%ymm8
 vpxor %ymm7,%ymm8,%ymm7
 vpshufb %ymm14,%ymm7,%ymm7
 vpaddd %ymm2,%ymm9,%ymm9
 vpxor %ymm4,%ymm9,%ymm4
 vpshufb %ymm14,%ymm4,%ymm4
 vpaddd %ymm7,%ymm12,%ymm12
 vpxor %ymm1,%ymm12,%ymm1
 vpslld $7,%ymm1,%ymm15
 vpsrld $25,%ymm1,%ymm1
 vpor %ymm1,%ymm15,%ymm1
 vbroadcasti128 (%r10),%ymm15
 vpaddd %ymm4,%ymm13,%ymm13
 vpxor %ymm2,%ymm13,%ymm2
 vpslld $7,%ymm2,%ymm14
 vpsrld $25,%ymm2,%ymm2
 vpor %ymm2,%ymm14,%ymm2
 vmovdqa %ymm12,64(%rsp)
 vmovdqa %ymm13,96(%rsp)
 vmovdqa 0(%rsp),%ymm12
 vmovdqa 32(%rsp),%ymm13
 vpaddd %ymm3,%ymm10,%ymm10
 vpxor %ymm5,%ymm10,%ymm5
 vpshufb %ymm15,%ymm5,%ymm5
 vpaddd %ymm0,%ymm11,%ymm11
 vpxor %ymm6,%ymm11,%ymm6
 vpshufb %ymm15,%ymm6,%ymm6
 vpaddd %ymm5,%ymm12,%ymm12
 vpxor %ymm3,%ymm12,%ymm3
 vpslld $12,%ymm3,%ymm14
 vpsrld $20,%ymm3,%ymm3
 vpor %ymm3,%ymm14,%ymm3
 vbroadcasti128 (%r11),%ymm14
 vpaddd %ymm6,%ymm13,%ymm13
 vpxor %ymm0,%ymm13,%ymm0
 vpslld $12,%ymm0,%ymm15
 vpsrld $20,%ymm0,%ymm0
 vpor %ymm0,%ymm15,%ymm0
 vpaddd %ymm3,%ymm10,%ymm10
 vpxor %ymm5,%ymm10,%ymm5
 vpshufb %ymm14,%ymm5,%ymm5
 vpaddd %ymm0,%ymm11,%ymm11
 vpxor %ymm6,%ymm11,%ymm6
 vpshufb %ymm14,%ymm6,%ymm6
 vpaddd %ymm5,%ymm12,%ymm12
 vpxor %ymm3,%ymm12,%ymm3
 vpslld $7,%ymm3,%ymm15
 vpsrld $25,%ymm3,%ymm3
 vpor %ymm3,%ymm15,%ymm3
 vbroadcasti128 (%r10),%ymm15
 vpaddd %ymm6,%ymm13,%ymm13
 vpxor %ymm0,%ymm13,%ymm0
 vpslld $7,%ymm0,%ymm14
 vpsrld $25,%ymm0,%ymm0
 vpor %ymm0,%ymm14,%ymm0
 decl %eax
 jnz .Loop8x

 leaq 512(%rsp),%rax
 vpaddd 128-256(%rcx),%ymm8,%ymm8
 vpaddd 160-256(%rcx),%ymm9,%ymm9
 vpaddd 192-256(%rcx),%ymm10,%ymm10
 vpaddd 224-256(%rcx),%ymm11,%ymm11

 vpunpckldq %ymm9,%ymm8,%ymm14
 vpunpckldq %ymm11,%ymm10,%ymm15
 vpunpckhdq %ymm9,%ymm8,%ymm8
 vpunpckhdq %ymm11,%ymm10,%ymm10
 vpunpcklqdq %ymm15,%ymm14,%ymm9
 vpunpckhqdq %ymm15,%ymm14,%ymm14
 vpunpcklqdq %ymm10,%ymm8,%ymm11
 vpunpckhqdq %ymm10,%ymm8,%ymm8
 vpaddd 256-256(%rcx),%ymm0,%ymm0
 vpaddd 288-256(%rcx),%ymm1,%ymm1
 vpaddd 320-256(%rcx),%ymm2,%ymm2
 vpaddd 352-256(%rcx),%ymm3,%ymm3

 vpunpckldq %ymm1,%ymm0,%ymm10
 vpunpckldq %ymm3,%ymm2,%ymm15
 vpunpckhdq %ymm1,%ymm0,%ymm0
 vpunpckhdq %ymm3,%ymm2,%ymm2
 vpunpcklqdq %ymm15,%ymm10,%ymm1
 vpunpckhqdq %ymm15,%ymm10,%ymm10
 vpunpcklqdq %ymm2,%ymm0,%ymm3
 vpunpckhqdq %ymm2,%ymm0,%ymm0
 vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
 vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
 vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
 vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
 vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
 vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
 vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
 vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
 vmovdqa %ymm15,0(%rsp)
 vmovdqa %ymm9,32(%rsp)
 vmovdqa 64(%rsp),%ymm15
 vmovdqa 96(%rsp),%ymm9

 vpaddd 384-512(%rax),%ymm12,%ymm12
 vpaddd 416-512(%rax),%ymm13,%ymm13
 vpaddd 448-512(%rax),%ymm15,%ymm15
 vpaddd 480-512(%rax),%ymm9,%ymm9

 vpunpckldq %ymm13,%ymm12,%ymm2
 vpunpckldq %ymm9,%ymm15,%ymm8
 vpunpckhdq %ymm13,%ymm12,%ymm12
 vpunpckhdq %ymm9,%ymm15,%ymm15
 vpunpcklqdq %ymm8,%ymm2,%ymm13
 vpunpckhqdq %ymm8,%ymm2,%ymm2
 vpunpcklqdq %ymm15,%ymm12,%ymm9
 vpunpckhqdq %ymm15,%ymm12,%ymm12
 vpaddd 512-512(%rax),%ymm4,%ymm4
 vpaddd 544-512(%rax),%ymm5,%ymm5
 vpaddd 576-512(%rax),%ymm6,%ymm6
 vpaddd 608-512(%rax),%ymm7,%ymm7

 vpunpckldq %ymm5,%ymm4,%ymm15
 vpunpckldq %ymm7,%ymm6,%ymm8
 vpunpckhdq %ymm5,%ymm4,%ymm4
 vpunpckhdq %ymm7,%ymm6,%ymm6
 vpunpcklqdq %ymm8,%ymm15,%ymm5
 vpunpckhqdq %ymm8,%ymm15,%ymm15
 vpunpcklqdq %ymm6,%ymm4,%ymm7
 vpunpckhqdq %ymm6,%ymm4,%ymm4
 vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
 vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
 vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
 vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
 vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
 vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
 vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
 vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
 vmovdqa 0(%rsp),%ymm6
 vmovdqa 32(%rsp),%ymm12

 cmpq $512,%rdx
 jb .Ltail8x

 vpxor 0(%rsi),%ymm6,%ymm6
 vpxor 32(%rsi),%ymm8,%ymm8
 vpxor 64(%rsi),%ymm1,%ymm1
 vpxor 96(%rsi),%ymm5,%ymm5
 leaq 128(%rsi),%rsi
 vmovdqu %ymm6,0(%rdi)
 vmovdqu %ymm8,32(%rdi)
 vmovdqu %ymm1,64(%rdi)
 vmovdqu %ymm5,96(%rdi)
 leaq 128(%rdi),%rdi

 vpxor 0(%rsi),%ymm12,%ymm12
 vpxor 32(%rsi),%ymm13,%ymm13
 vpxor 64(%rsi),%ymm10,%ymm10
 vpxor 96(%rsi),%ymm15,%ymm15
 leaq 128(%rsi),%rsi
 vmovdqu %ymm12,0(%rdi)
 vmovdqu %ymm13,32(%rdi)
 vmovdqu %ymm10,64(%rdi)
 vmovdqu %ymm15,96(%rdi)
 leaq 128(%rdi),%rdi

 vpxor 0(%rsi),%ymm14,%ymm14
 vpxor 32(%rsi),%ymm2,%ymm2
 vpxor 64(%rsi),%ymm3,%ymm3
 vpxor 96(%rsi),%ymm7,%ymm7
 leaq 128(%rsi),%rsi
 vmovdqu %ymm14,0(%rdi)
 vmovdqu %ymm2,32(%rdi)
 vmovdqu %ymm3,64(%rdi)
 vmovdqu %ymm7,96(%rdi)
 leaq 128(%rdi),%rdi

 vpxor 0(%rsi),%ymm11,%ymm11
 vpxor 32(%rsi),%ymm9,%ymm9
 vpxor 64(%rsi),%ymm0,%ymm0
 vpxor 96(%rsi),%ymm4,%ymm4
 leaq 128(%rsi),%rsi
 vmovdqu %ymm11,0(%rdi)
 vmovdqu %ymm9,32(%rdi)
 vmovdqu %ymm0,64(%rdi)
 vmovdqu %ymm4,96(%rdi)
 leaq 128(%rdi),%rdi

 subq $512,%rdx
 jnz .Loop_outer8x

 jmp .Ldone8x

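// AVX2 tail: handle remaining whole 64-byte blocks by size class, then
// byte-xor the final partial block in .Loop_tail8x.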
.Ltail8x:
 cmpq $448,%rdx
 jae .L448_or_more8x
 cmpq $384,%rdx
 jae .L384_or_more8x
 cmpq $320,%rdx
 jae .L320_or_more8x
 cmpq $256,%rdx
 jae .L256_or_more8x
 cmpq $192,%rdx
 jae .L192_or_more8x
 cmpq $128,%rdx
 jae .L128_or_more8x
 cmpq $64,%rdx
 jae .L64_or_more8x

 xorq %r10,%r10
 vmovdqa %ymm6,0(%rsp)
 vmovdqa %ymm8,32(%rsp)
 jmp .Loop_tail8x

.align 32
.L64_or_more8x:
 vpxor 0(%rsi),%ymm6,%ymm6
 vpxor 32(%rsi),%ymm8,%ymm8
 vmovdqu %ymm6,0(%rdi)
 vmovdqu %ymm8,32(%rdi)
 je .Ldone8x

 leaq 64(%rsi),%rsi
 xorq %r10,%r10
 vmovdqa %ymm1,0(%rsp)
 leaq 64(%rdi),%rdi
 subq $64,%rdx
 vmovdqa %ymm5,32(%rsp)
 jmp .Loop_tail8x

.align 32
.L128_or_more8x:
 vpxor 0(%rsi),%ymm6,%ymm6
 vpxor 32(%rsi),%ymm8,%ymm8
 vpxor 64(%rsi),%ymm1,%ymm1
 vpxor 96(%rsi),%ymm5,%ymm5
 vmovdqu %ymm6,0(%rdi)
 vmovdqu %ymm8,32(%rdi)
 vmovdqu %ymm1,64(%rdi)
 vmovdqu %ymm5,96(%rdi)
 je .Ldone8x

 leaq 128(%rsi),%rsi
 xorq %r10,%r10
 vmovdqa %ymm12,0(%rsp)
 leaq 128(%rdi),%rdi
 subq $128,%rdx
 vmovdqa %ymm13,32(%rsp)
 jmp .Loop_tail8x

.align 32
.L192_or_more8x:
 vpxor 0(%rsi),%ymm6,%ymm6
 vpxor 32(%rsi),%ymm8,%ymm8
 vpxor 64(%rsi),%ymm1,%ymm1
 vpxor 96(%rsi),%ymm5,%ymm5
 vpxor 128(%rsi),%ymm12,%ymm12
 vpxor 160(%rsi),%ymm13,%ymm13
 vmovdqu %ymm6,0(%rdi)
 vmovdqu %ymm8,32(%rdi)
 vmovdqu %ymm1,64(%rdi)
 vmovdqu %ymm5,96(%rdi)
 vmovdqu %ymm12,128(%rdi)
 vmovdqu %ymm13,160(%rdi)
 je .Ldone8x

 leaq 192(%rsi),%rsi
 xorq %r10,%r10
 vmovdqa %ymm10,0(%rsp)
 leaq 192(%rdi),%rdi
 subq $192,%rdx
 vmovdqa %ymm15,32(%rsp)
 jmp .Loop_tail8x

.align 32
.L256_or_more8x:
 vpxor 0(%rsi),%ymm6,%ymm6
 vpxor 32(%rsi),%ymm8,%ymm8
 vpxor 64(%rsi),%ymm1,%ymm1
 vpxor 96(%rsi),%ymm5,%ymm5
 vpxor 128(%rsi),%ymm12,%ymm12
 vpxor 160(%rsi),%ymm13,%ymm13
 vpxor 192(%rsi),%ymm10,%ymm10
 vpxor 224(%rsi),%ymm15,%ymm15
 vmovdqu %ymm6,0(%rdi)
 vmovdqu %ymm8,32(%rdi)
 vmovdqu %ymm1,64(%rdi)
 vmovdqu %ymm5,96(%rdi)
 vmovdqu %ymm12,128(%rdi)
 vmovdqu %ymm13,160(%rdi)
 vmovdqu %ymm10,192(%rdi)
 vmovdqu %ymm15,224(%rdi)
 je .Ldone8x

 leaq 256(%rsi),%rsi
 xorq %r10,%r10
 vmovdqa %ymm14,0(%rsp)
 leaq 256(%rdi),%rdi
 subq $256,%rdx
 vmovdqa %ymm2,32(%rsp)
 jmp .Loop_tail8x

.align 32
.L320_or_more8x:
 vpxor 0(%rsi),%ymm6,%ymm6
 vpxor 32(%rsi),%ymm8,%ymm8
 vpxor 64(%rsi),%ymm1,%ymm1
 vpxor 96(%rsi),%ymm5,%ymm5
 vpxor 128(%rsi),%ymm12,%ymm12
 vpxor 160(%rsi),%ymm13,%ymm13
 vpxor 192(%rsi),%ymm10,%ymm10
 vpxor 224(%rsi),%ymm15,%ymm15
 vpxor 256(%rsi),%ymm14,%ymm14
 vpxor 288(%rsi),%ymm2,%ymm2
 vmovdqu %ymm6,0(%rdi)
 vmovdqu %ymm8,32(%rdi)
 vmovdqu %ymm1,64(%rdi)
 vmovdqu %ymm5,96(%rdi)
 vmovdqu %ymm12,128(%rdi)
 vmovdqu %ymm13,160(%rdi)
 vmovdqu %ymm10,192(%rdi)
 vmovdqu %ymm15,224(%rdi)
 vmovdqu %ymm14,256(%rdi)
 vmovdqu %ymm2,288(%rdi)
 je .Ldone8x

 leaq 320(%rsi),%rsi
 xorq %r10,%r10
 vmovdqa %ymm3,0(%rsp)
 leaq 320(%rdi),%rdi
 subq $320,%rdx
 vmovdqa %ymm7,32(%rsp)
 jmp .Loop_tail8x

.align 32
.L384_or_more8x:
 vpxor 0(%rsi),%ymm6,%ymm6
 vpxor 32(%rsi),%ymm8,%ymm8
 vpxor 64(%rsi),%ymm1,%ymm1
 vpxor 96(%rsi),%ymm5,%ymm5
 vpxor 128(%rsi),%ymm12,%ymm12
 vpxor 160(%rsi),%ymm13,%ymm13
 vpxor 192(%rsi),%ymm10,%ymm10
 vpxor 224(%rsi),%ymm15,%ymm15
 vpxor 256(%rsi),%ymm14,%ymm14
 vpxor 288(%rsi),%ymm2,%ymm2
 vpxor 320(%rsi),%ymm3,%ymm3
 vpxor 352(%rsi),%ymm7,%ymm7
 vmovdqu %ymm6,0(%rdi)
 vmovdqu %ymm8,32(%rdi)
 vmovdqu %ymm1,64(%rdi)
 vmovdqu %ymm5,96(%rdi)
 vmovdqu %ymm12,128(%rdi)
 vmovdqu %ymm13,160(%rdi)
 vmovdqu %ymm10,192(%rdi)
 vmovdqu %ymm15,224(%rdi)
 vmovdqu %ymm14,256(%rdi)
 vmovdqu %ymm2,288(%rdi)
 vmovdqu %ymm3,320(%rdi)
 vmovdqu %ymm7,352(%rdi)
 je .Ldone8x

 leaq 384(%rsi),%rsi
 xorq %r10,%r10
 vmovdqa %ymm11,0(%rsp)
 leaq 384(%rdi),%rdi
 subq $384,%rdx
 vmovdqa %ymm9,32(%rsp)
 jmp .Loop_tail8x

.align 32
.L448_or_more8x:
 vpxor 0(%rsi),%ymm6,%ymm6
 vpxor 32(%rsi),%ymm8,%ymm8
 vpxor 64(%rsi),%ymm1,%ymm1
 vpxor 96(%rsi),%ymm5,%ymm5
 vpxor 128(%rsi),%ymm12,%ymm12
 vpxor 160(%rsi),%ymm13,%ymm13
 vpxor 192(%rsi),%ymm10,%ymm10
 vpxor 224(%rsi),%ymm15,%ymm15
 vpxor 256(%rsi),%ymm14,%ymm14
 vpxor 288(%rsi),%ymm2,%ymm2
 vpxor 320(%rsi),%ymm3,%ymm3
 vpxor 352(%rsi),%ymm7,%ymm7
 vpxor 384(%rsi),%ymm11,%ymm11
 vpxor 416(%rsi),%ymm9,%ymm9
 vmovdqu %ymm6,0(%rdi)
 vmovdqu %ymm8,32(%rdi)
 vmovdqu %ymm1,64(%rdi)
 vmovdqu %ymm5,96(%rdi)
 vmovdqu %ymm12,128(%rdi)
 vmovdqu %ymm13,160(%rdi)
 vmovdqu %ymm10,192(%rdi)
 vmovdqu %ymm15,224(%rdi)
 vmovdqu %ymm14,256(%rdi)
 vmovdqu %ymm2,288(%rdi)
 vmovdqu %ymm3,320(%rdi)
 vmovdqu %ymm7,352(%rdi)
 vmovdqu %ymm11,384(%rdi)
 vmovdqu %ymm9,416(%rdi)
 je .Ldone8x

 leaq 448(%rsi),%rsi
 xorq %r10,%r10
 vmovdqa %ymm0,0(%rsp)
 leaq 448(%rdi),%rdi
 subq $448,%rdx
 vmovdqa %ymm4,32(%rsp)

.Loop_tail8x:
 movzbl (%rsi,%r10,1),%eax
 movzbl (%rsp,%r10,1),%ecx
 leaq 1(%r10),%r10
 xorl %ecx,%eax
 movb %al,-1(%rdi,%r10,1)
 decq %rdx
 jnz .Loop_tail8x

.Ldone8x:
 vzeroall
 leaq (%r9),%rsp
.cfi_def_cfa_register rsp
.L8x_epilogue:
 ret
.cfi_endproc
.size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2
#endif