blob: 5cf770f1d55bcf5c7ad0e8d7e785cc74afb5a56d [file] [log] [blame]
David Benjaminfe0c91e2024-03-18 15:37:24 +10001// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <openssl/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
7.text
8
// _bn_mul_mont_gather5_nohw
//
// Word-at-a-time multiply-and-reduce routine whose multiplier words are
// gathered from a table with SSE2 masks (presumably Montgomery
// multiplication, per the name - confirm against the C prototype).
//
// Inputs as consumed by the code below (System V AMD64 argument order):
//   %rdi     destination words, written in L$sub and L$copy
//   %rsi     multiplicand words, read as (%rsi,%r15,8)
//   %rdx     base of the table the multiplier word is gathered from
//   %rcx     words multiplied by the per-pass factor in %rbp and subtracted
//            in L$sub (presumably the modulus - confirm)
//   %r8      pointer to a 64-bit constant, dereferenced once
//   %r9d     word count; bounds every loop in this routine
//   8(%rsp)  32-bit selector (first stack argument), compared against the
//            counters derived from L$inc
// Output: %rax = 1.
// %rbx, %rbp, %r12-%r15 and %rsp are saved on entry and restored on exit.
// Scratch frame after setup: words at 0(%rsp)..(%rsp,%r9,8), the entry
// %rsp at 8(%rsp,%r9,8), and sixteen 16-byte masks above that.
David Benjamin3efe2eb2024-05-08 22:24:27 -07009.globl _bn_mul_mont_gather5_nohw
10.private_extern _bn_mul_mont_gather5_nohw
David Benjaminfe0c91e2024-03-18 15:37:24 +100011
12.p2align 6
David Benjamin3efe2eb2024-05-08 22:24:27 -070013_bn_mul_mont_gather5_nohw:
David Benjaminfe0c91e2024-03-18 15:37:24 +100014
15_CET_ENDBR
David Benjamin3efe2eb2024-05-08 22:24:27 -070016
17
// Writing %r9d clears the upper half of %r9, so the count is usable as a
// 64-bit index. %rax keeps the entry stack pointer for the epilogue.
David Benjaminfe0c91e2024-03-18 15:37:24 +100018 movl %r9d,%r9d
19 movq %rsp,%rax
20
// Load the stack-passed selector before the pushes move %rsp.
David Benjaminfe0c91e2024-03-18 15:37:24 +100021 movd 8(%rsp),%xmm5
22 pushq %rbx
23
24 pushq %rbp
25
26 pushq %r12
27
28 pushq %r13
29
30 pushq %r14
31
32 pushq %r15
33
34
// %r10 = (%rsp - 280 - 8*count) rounded down to a multiple of 1024: the
// bottom of the scratch frame. %r9 is negated only for the address
// computation and restored immediately.
35 negq %r9
36 movq %rsp,%r11
37 leaq -280(%rsp,%r9,8),%r10
38 negq %r9
39 andq $-1024,%r10
40
41
42
43
44
45
46
47
48
// Move %rsp down to %r10 in steps of at most 4096 bytes, reading one word
// at every step (presumably so guard pages are touched in order - confirm).
49 subq %r10,%r11
50 andq $-4096,%r11
51 leaq (%r10,%r11,1),%rsp
52 movq (%rsp),%r11
53 cmpq %r10,%rsp
54 ja L$mul_page_walk
55 jmp L$mul_page_walk_done
56
57L$mul_page_walk:
58 leaq -4096(%rsp),%rsp
59 movq (%rsp),%r11
60 cmpq %r10,%rsp
61 ja L$mul_page_walk
62L$mul_page_walk_done:
63
// %r10 = address of the L$inc constants (defined outside this routine).
// The entry %rsp is parked just above the scratch words.
64 leaq L$inc(%rip),%r10
65 movq %rax,8(%rsp,%r9,8)
66
67L$mul_body:
68
// %r12 = table base + 128 so the 256-byte row can be addressed with
// displacements -128..112. %xmm0/%xmm1 = the two 16-byte L$inc vectors.
// %r10 = 16-byte aligned mask area (biased by -112).
69 leaq 128(%rdx),%r12
70 movdqa 0(%r10),%xmm0
71 movdqa 16(%r10),%xmm1
72 leaq 24-112(%rsp,%r9,8),%r10
73 andq $-16,%r10
74
// Broadcast the selector to all four dword lanes, then build sixteen
// compare masks: each step adds the second L$inc vector (kept in %xmm4) to
// a running counter vector and compares the previous counter with the
// selector. Masks are stored at 112(%r10)..352(%r10); the last four are
// also still live in %xmm0-%xmm3 when the gather starts.
// NOTE(review): the stray .byte 0x67 bytes are address-size prefixes on the
// following instruction, presumably emitted as padding - confirm.
75 pshufd $0,%xmm5,%xmm5
76 movdqa %xmm1,%xmm4
77 movdqa %xmm1,%xmm2
78 paddd %xmm0,%xmm1
79 pcmpeqd %xmm5,%xmm0
80.byte 0x67
81 movdqa %xmm4,%xmm3
82 paddd %xmm1,%xmm2
83 pcmpeqd %xmm5,%xmm1
84 movdqa %xmm0,112(%r10)
85 movdqa %xmm4,%xmm0
86
87 paddd %xmm2,%xmm3
88 pcmpeqd %xmm5,%xmm2
89 movdqa %xmm1,128(%r10)
90 movdqa %xmm4,%xmm1
91
92 paddd %xmm3,%xmm0
93 pcmpeqd %xmm5,%xmm3
94 movdqa %xmm2,144(%r10)
95 movdqa %xmm4,%xmm2
96
97 paddd %xmm0,%xmm1
98 pcmpeqd %xmm5,%xmm0
99 movdqa %xmm3,160(%r10)
100 movdqa %xmm4,%xmm3
101 paddd %xmm1,%xmm2
102 pcmpeqd %xmm5,%xmm1
103 movdqa %xmm0,176(%r10)
104 movdqa %xmm4,%xmm0
105
106 paddd %xmm2,%xmm3
107 pcmpeqd %xmm5,%xmm2
108 movdqa %xmm1,192(%r10)
109 movdqa %xmm4,%xmm1
110
111 paddd %xmm3,%xmm0
112 pcmpeqd %xmm5,%xmm3
113 movdqa %xmm2,208(%r10)
114 movdqa %xmm4,%xmm2
115
116 paddd %xmm0,%xmm1
117 pcmpeqd %xmm5,%xmm0
118 movdqa %xmm3,224(%r10)
119 movdqa %xmm4,%xmm3
120 paddd %xmm1,%xmm2
121 pcmpeqd %xmm5,%xmm1
122 movdqa %xmm0,240(%r10)
123 movdqa %xmm4,%xmm0
124
125 paddd %xmm2,%xmm3
126 pcmpeqd %xmm5,%xmm2
127 movdqa %xmm1,256(%r10)
128 movdqa %xmm4,%xmm1
129
130 paddd %xmm3,%xmm0
131 pcmpeqd %xmm5,%xmm3
132 movdqa %xmm2,272(%r10)
133 movdqa %xmm4,%xmm2
134
135 paddd %xmm0,%xmm1
136 pcmpeqd %xmm5,%xmm0
137 movdqa %xmm3,288(%r10)
138 movdqa %xmm4,%xmm3
139 paddd %xmm1,%xmm2
140 pcmpeqd %xmm5,%xmm1
141 movdqa %xmm0,304(%r10)
142
143 paddd %xmm2,%xmm3
144.byte 0x67
145 pcmpeqd %xmm5,%xmm2
146 movdqa %xmm1,320(%r10)
147
148 pcmpeqd %xmm5,%xmm3
149 movdqa %xmm2,336(%r10)
// First gather: AND every 16-byte slice of the 256-byte table row with its
// mask and OR the results together. All sixteen slices are read whatever
// the selector is; only the masks decide which data survives.
150 pand 64(%r12),%xmm0
151
152 pand 80(%r12),%xmm1
153 pand 96(%r12),%xmm2
154 movdqa %xmm3,352(%r10)
155 pand 112(%r12),%xmm3
156 por %xmm2,%xmm0
157 por %xmm3,%xmm1
158 movdqa -128(%r12),%xmm4
159 movdqa -112(%r12),%xmm5
160 movdqa -96(%r12),%xmm2
161 pand 112(%r10),%xmm4
162 movdqa -80(%r12),%xmm3
163 pand 128(%r10),%xmm5
164 por %xmm4,%xmm0
165 pand 144(%r10),%xmm2
166 por %xmm5,%xmm1
167 pand 160(%r10),%xmm3
168 por %xmm2,%xmm0
169 por %xmm3,%xmm1
170 movdqa -64(%r12),%xmm4
171 movdqa -48(%r12),%xmm5
172 movdqa -32(%r12),%xmm2
173 pand 176(%r10),%xmm4
174 movdqa -16(%r12),%xmm3
175 pand 192(%r10),%xmm5
176 por %xmm4,%xmm0
177 pand 208(%r10),%xmm2
178 por %xmm5,%xmm1
179 pand 224(%r10),%xmm3
180 por %xmm2,%xmm0
181 por %xmm3,%xmm1
182 movdqa 0(%r12),%xmm4
183 movdqa 16(%r12),%xmm5
184 movdqa 32(%r12),%xmm2
185 pand 240(%r10),%xmm4
186 movdqa 48(%r12),%xmm3
187 pand 256(%r10),%xmm5
188 por %xmm4,%xmm0
189 pand 272(%r10),%xmm2
190 por %xmm5,%xmm1
191 pand 288(%r10),%xmm3
192 por %xmm2,%xmm0
193 por %xmm3,%xmm1
194 por %xmm1,%xmm0
195
// Fold the high qword onto the low qword, step %r12 to the next 256-byte
// row, and move the gathered word to %rbx. The .byte sequence
// 66 48 0F 7E C3 encodes movq %xmm0,%rbx.
196 pshufd $0x4e,%xmm0,%xmm1
197 por %xmm1,%xmm0
198 leaq 256(%r12),%r12
199.byte 102,72,15,126,195
200
// %r8 = the 64-bit constant the caller pointed at; %rax = first word at %rsi.
201 movq (%r8),%r8
202 movq (%rsi),%rax
203
// %r14 = outer pass counter, %r15 = inner word index.
204 xorq %r14,%r14
205 xorq %r15,%r15
206
// %rbp = low64(rsi[0] * %rbx) * %r8 is the per-pass factor applied to the
// %rcx words. Adding its first product to %r10 is done only for the carry;
// the low word itself is not stored.
207 movq %r8,%rbp
208 mulq %rbx
209 movq %rax,%r10
210 movq (%rcx),%rax
211
212 imulq %r10,%rbp
213 movq %rdx,%r11
214
215 mulq %rbp
216 addq %rax,%r10
217 movq 8(%rsi),%rax
218 adcq $0,%rdx
219 movq %rdx,%r13
220
221 leaq 1(%r15),%r15
222 jmp L$1st_enter
223
// First pass: for each index accumulate rsi[j]*%rbx + rcx[j]*%rbp with
// carries in %r11/%r10 and %r13, storing each result one word lower
// (-16(%rsp,%r15,8) after %r15 has already been advanced).
224.p2align 4
225L$1st:
226 addq %rax,%r13
227 movq (%rsi,%r15,8),%rax
228 adcq $0,%rdx
229 addq %r11,%r13
230 movq %r10,%r11
231 adcq $0,%rdx
232 movq %r13,-16(%rsp,%r15,8)
233 movq %rdx,%r13
234
235L$1st_enter:
236 mulq %rbx
237 addq %rax,%r11
238 movq (%rcx,%r15,8),%rax
239 adcq $0,%rdx
240 leaq 1(%r15),%r15
241 movq %rdx,%r10
242
243 mulq %rbp
244 cmpq %r9,%r15
245 jne L$1st
246
247
// Drain the last product and write the two top words of the accumulator,
// including the final carry at (%rsp,%r9,8).
248 addq %rax,%r13
249 adcq $0,%rdx
250 addq %r11,%r13
251 adcq $0,%rdx
252 movq %r13,-16(%rsp,%r9,8)
253 movq %rdx,%r13
254 movq %r10,%r11
255
256 xorq %rdx,%rdx
257 addq %r11,%r13
258 adcq $0,%rdx
259 movq %r13,-8(%rsp,%r9,8)
260 movq %rdx,(%rsp,%r9,8)
261
262 leaq 1(%r14),%r14
263 jmp L$outer
// Remaining passes. %rdx is rebuilt to address the same sixteen masks with
// displacements -128..112 (it is clobbered by mulq in every pass), and the
// next table row is gathered into %rbx exactly as above.
264.p2align 4
265L$outer:
266 leaq 24+128(%rsp,%r9,8),%rdx
267 andq $-16,%rdx
268 pxor %xmm4,%xmm4
269 pxor %xmm5,%xmm5
270 movdqa -128(%r12),%xmm0
271 movdqa -112(%r12),%xmm1
272 movdqa -96(%r12),%xmm2
273 movdqa -80(%r12),%xmm3
274 pand -128(%rdx),%xmm0
275 pand -112(%rdx),%xmm1
276 por %xmm0,%xmm4
277 pand -96(%rdx),%xmm2
278 por %xmm1,%xmm5
279 pand -80(%rdx),%xmm3
280 por %xmm2,%xmm4
281 por %xmm3,%xmm5
282 movdqa -64(%r12),%xmm0
283 movdqa -48(%r12),%xmm1
284 movdqa -32(%r12),%xmm2
285 movdqa -16(%r12),%xmm3
286 pand -64(%rdx),%xmm0
287 pand -48(%rdx),%xmm1
288 por %xmm0,%xmm4
289 pand -32(%rdx),%xmm2
290 por %xmm1,%xmm5
291 pand -16(%rdx),%xmm3
292 por %xmm2,%xmm4
293 por %xmm3,%xmm5
294 movdqa 0(%r12),%xmm0
295 movdqa 16(%r12),%xmm1
296 movdqa 32(%r12),%xmm2
297 movdqa 48(%r12),%xmm3
298 pand 0(%rdx),%xmm0
299 pand 16(%rdx),%xmm1
300 por %xmm0,%xmm4
301 pand 32(%rdx),%xmm2
302 por %xmm1,%xmm5
303 pand 48(%rdx),%xmm3
304 por %xmm2,%xmm4
305 por %xmm3,%xmm5
306 movdqa 64(%r12),%xmm0
307 movdqa 80(%r12),%xmm1
308 movdqa 96(%r12),%xmm2
309 movdqa 112(%r12),%xmm3
310 pand 64(%rdx),%xmm0
311 pand 80(%rdx),%xmm1
312 por %xmm0,%xmm4
313 pand 96(%rdx),%xmm2
314 por %xmm1,%xmm5
315 pand 112(%rdx),%xmm3
316 por %xmm2,%xmm4
317 por %xmm3,%xmm5
318 por %xmm5,%xmm4
319
320 pshufd $0x4e,%xmm4,%xmm0
321 por %xmm4,%xmm0
322 leaq 256(%r12),%r12
323
// .byte sequence below encodes movq %xmm0,%rbx (new multiplier word).
324 movq (%rsi),%rax
325.byte 102,72,15,126,195
326
// Same structure as the first pass, except the words already on the stack
// are added in (%r10 is loaded from the scratch area before each step).
327 xorq %r15,%r15
328 movq %r8,%rbp
329 movq (%rsp),%r10
330
331 mulq %rbx
332 addq %rax,%r10
333 movq (%rcx),%rax
334 adcq $0,%rdx
335
336 imulq %r10,%rbp
337 movq %rdx,%r11
338
339 mulq %rbp
340 addq %rax,%r10
341 movq 8(%rsi),%rax
342 adcq $0,%rdx
343 movq 8(%rsp),%r10
344 movq %rdx,%r13
345
346 leaq 1(%r15),%r15
347 jmp L$inner_enter
348
349.p2align 4
350L$inner:
351 addq %rax,%r13
352 movq (%rsi,%r15,8),%rax
353 adcq $0,%rdx
354 addq %r10,%r13
355 movq (%rsp,%r15,8),%r10
356 adcq $0,%rdx
357 movq %r13,-16(%rsp,%r15,8)
358 movq %rdx,%r13
359
360L$inner_enter:
361 mulq %rbx
362 addq %rax,%r11
363 movq (%rcx,%r15,8),%rax
364 adcq $0,%rdx
365 addq %r11,%r10
366 movq %rdx,%r11
367 adcq $0,%r11
368 leaq 1(%r15),%r15
369
370 mulq %rbp
371 cmpq %r9,%r15
372 jne L$inner
373
374 addq %rax,%r13
375 adcq $0,%rdx
376 addq %r10,%r13
377 movq (%rsp,%r9,8),%r10
378 adcq $0,%rdx
379 movq %r13,-16(%rsp,%r9,8)
380 movq %rdx,%r13
381
382 xorq %rdx,%rdx
383 addq %r11,%r13
384 adcq $0,%rdx
385 addq %r10,%r13
386 adcq $0,%rdx
387 movq %r13,-8(%rsp,%r9,8)
388 movq %rdx,(%rsp,%r9,8)
389
// One pass per table row: loop until %r14 reaches the word count.
390 leaq 1(%r14),%r14
391 cmpq %r9,%r14
392 jb L$outer
393
// Final subtraction. xorq also clears CF, and the loop body uses only
// mov/lea/dec between the sbbq instructions so the borrow chain survives.
// rdi[i] = stack[i] - rcx[i] for every word.
394 xorq %r14,%r14
395 movq (%rsp),%rax
396 leaq (%rsp),%rsi
397 movq %r9,%r15
398 jmp L$sub
399.p2align 4
400L$sub: sbbq (%rcx,%r14,8),%rax
401 movq %rax,(%rdi,%r14,8)
402 movq 8(%rsi,%r14,8),%rax
403 leaq 1(%r14),%r14
404 decq %r15
405 jnz L$sub
406
// %rax = top accumulator word minus the final borrow; it and its complement
// in %rbx are used as AND masks below (presumably %rax is 0 or all-ones at
// this point - confirm).
407 sbbq $0,%rax
408 movq $-1,%rbx
409 xorq %rax,%rbx
410 xorq %r14,%r14
411 movq %r9,%r15
412
// Branch-free select per word: rdi[i] = (rdi[i] & %rbx) | (stack[i] & %rax).
// Each scratch word is overwritten with the loop index as it is consumed
// (presumably to avoid leaving intermediate values on the stack).
413L$copy:
414 movq (%rdi,%r14,8),%rcx
415 movq (%rsp,%r14,8),%rdx
416 andq %rbx,%rcx
417 andq %rax,%rdx
418 movq %r14,(%rsp,%r14,8)
419 orq %rcx,%rdx
420 movq %rdx,(%rdi,%r14,8)
421 leaq 1(%r14),%r14
422 subq $1,%r15
423 jnz L$copy
424
// Epilogue: reload the entry %rsp, return 1, restore the six pushed
// registers from below it, and drop the whole frame.
425 movq 8(%rsp,%r9,8),%rsi
426
427 movq $1,%rax
428
429 movq -48(%rsi),%r15
430
431 movq -40(%rsi),%r14
432
433 movq -32(%rsi),%r13
434
435 movq -24(%rsi),%r12
436
437 movq -16(%rsi),%rbp
438
439 movq -8(%rsi),%rbx
440
441 leaq (%rsi),%rsp
442
443L$mul_epilogue:
444 ret
445
446
// _bn_mul4x_mont_gather5
//
// Entry wrapper around mul4x_internal: saves the callee-saved registers,
// carves out an aligned scratch frame, calls mul4x_internal, then restores
// everything and returns 1 in %rax.
// Argument registers are passed through untouched except %r9, which is
// converted from a word count (%r9d) to a byte count (8 * count).
// %rax carries the entry %rsp into mul4x_internal, which reads the
// stack-passed argument through it.
David Benjamin3efe2eb2024-05-08 22:24:27 -0700447.globl _bn_mul4x_mont_gather5
448.private_extern _bn_mul4x_mont_gather5
David Benjaminfe0c91e2024-03-18 15:37:24 +1000449
450.p2align 5
David Benjamin3efe2eb2024-05-08 22:24:27 -0700451_bn_mul4x_mont_gather5:
David Benjaminfe0c91e2024-03-18 15:37:24 +1000452
David Benjamin3efe2eb2024-05-08 22:24:27 -0700453_CET_ENDBR
// NOTE(review): the .byte 0x67 bytes are address-size prefixes on the next
// instruction, presumably emitted as padding - confirm.
David Benjaminfe0c91e2024-03-18 15:37:24 +1000454.byte 0x67
455 movq %rsp,%rax
456
David Benjaminfe0c91e2024-03-18 15:37:24 +1000457 pushq %rbx
458
459 pushq %rbp
460
461 pushq %r12
462
463 pushq %r13
464
465 pushq %r14
466
467 pushq %r15
468
469L$mul4x_prologue:
470
471.byte 0x67
David Benjamin3efe2eb2024-05-08 22:24:27 -0700472
473
474
// %r9 = 8 * count (the 32-bit shift also clears the upper half of %r9),
// %r10 = 3 * that byte length, then %r9 is negated for the address math.
David Benjaminfe0c91e2024-03-18 15:37:24 +1000475 shll $3,%r9d
476 leaq (%r9,%r9,2),%r10
477 negq %r9
478
479
480
481
482
483
484
485
486
487
// Choose the frame bottom in %rbp. The candidate is %rsp - 320 - 2*bytes;
// %r11 is its distance from %rdi modulo 4096. Depending on how that
// distance compares with %r10 the frame is moved further down by an extra
// amount (presumably to keep the frame and the output buffer from aliasing
// modulo the page size - confirm).
488 leaq -320(%rsp,%r9,2),%r11
489 movq %rsp,%rbp
490 subq %rdi,%r11
491 andq $4095,%r11
492 cmpq %r11,%r10
493 jb L$mul4xsp_alt
494 subq %r11,%rbp
495 leaq -320(%rbp,%r9,2),%rbp
496 jmp L$mul4xsp_done
497
498.p2align 5
499L$mul4xsp_alt:
500 leaq 4096-320(,%r9,2),%r10
501 leaq -320(%rbp,%r9,2),%rbp
502 subq %r10,%r11
503 movq $0,%r10
504 cmovcq %r10,%r11
505 subq %r11,%rbp
506L$mul4xsp_done:
// Align the frame to 64 bytes, then move %rsp down to it in steps of at
// most 4096 bytes, reading one word at each step (presumably so guard
// pages are touched in order - confirm).
507 andq $-64,%rbp
508 movq %rsp,%r11
509 subq %rbp,%r11
510 andq $-4096,%r11
511 leaq (%r11,%rbp,1),%rsp
512 movq (%rsp),%r10
513 cmpq %rbp,%rsp
514 ja L$mul4x_page_walk
515 jmp L$mul4x_page_walk_done
516
517L$mul4x_page_walk:
518 leaq -4096(%rsp),%rsp
519 movq (%rsp),%r10
520 cmpq %rbp,%rsp
521 ja L$mul4x_page_walk
522L$mul4x_page_walk_done:
523
// Restore %r9 to the positive byte length and park the entry %rsp in the
// frame; %rax still holds it for mul4x_internal.
524 negq %r9
525
526 movq %rax,40(%rsp)
527
528L$mul4x_body:
529
530 call mul4x_internal
531
// Epilogue: reload the entry %rsp, return 1, restore the six pushed
// registers from below it, and drop the whole frame.
532 movq 40(%rsp),%rsi
533
534 movq $1,%rax
535
536 movq -48(%rsi),%r15
537
538 movq -40(%rsi),%r14
539
540 movq -32(%rsi),%r13
541
542 movq -24(%rsi),%r12
543
544 movq -16(%rsi),%rbp
545
546 movq -8(%rsi),%rbx
547
548 leaq (%rsi),%rsp
549
550L$mul4x_epilogue:
551 ret
552
553
554
555
// mul4x_internal
//
// Four-words-per-iteration multiply-and-reduce core shared by
// _bn_mul4x_mont_gather5 and _bn_power5_nohw. It does not return by
// itself: it ends with a jump to L$sqr4x_sub_entry, which is defined
// outside this routine.
//
// Inputs as consumed below:
//   %rax   pointer whose 8(%rax) holds the 32-bit table selector (both
//          callers pass their own entry %rsp)
//   %rdx   base of the table the multiplier words are gathered from
//   %r9    operand length in bytes
//   %rsi   multiplicand words
//   %rcx   words multiplied by the per-pass factor in %rbp
//   %r8    pointer to a 64-bit constant, dereferenced once
//   %rdi   destination pointer; only saved here, at 56+8(%rsp)
// Frame offsets carry an extra +8 because this code is reached by call.
556.p2align 5
557mul4x_internal:
558
// %r13 = table base + 128 + 32*%r9: the value %r12 reaches after the last
// row, used as the outer loop bound. %r9 is restored by the shift right.
// %xmm0/%xmm1 = the two 16-byte vectors at L$inc; %r10 = mask area (biased
// by -112); %r12 = table base + 128.
559 shlq $5,%r9
560 movd 8(%rax),%xmm5
561 leaq L$inc(%rip),%rax
562 leaq 128(%rdx,%r9,1),%r13
563 shrq $5,%r9
564 movdqa 0(%rax),%xmm0
565 movdqa 16(%rax),%xmm1
566 leaq 88-112(%rsp,%r9,1),%r10
567 leaq 128(%rdx),%r12
568
// Broadcast the selector and build sixteen compare masks at
// 112(%r10)..352(%r10): a running counter vector is advanced by the second
// L$inc vector (kept in %xmm4) and each value is compared with the
// selector. NOTE(review): the .byte 0x67 bytes are address-size prefixes,
// presumably emitted as padding - confirm.
569 pshufd $0,%xmm5,%xmm5
570 movdqa %xmm1,%xmm4
571.byte 0x67,0x67
572 movdqa %xmm1,%xmm2
573 paddd %xmm0,%xmm1
574 pcmpeqd %xmm5,%xmm0
575.byte 0x67
576 movdqa %xmm4,%xmm3
577 paddd %xmm1,%xmm2
578 pcmpeqd %xmm5,%xmm1
579 movdqa %xmm0,112(%r10)
580 movdqa %xmm4,%xmm0
581
582 paddd %xmm2,%xmm3
583 pcmpeqd %xmm5,%xmm2
584 movdqa %xmm1,128(%r10)
585 movdqa %xmm4,%xmm1
586
587 paddd %xmm3,%xmm0
588 pcmpeqd %xmm5,%xmm3
589 movdqa %xmm2,144(%r10)
590 movdqa %xmm4,%xmm2
591
592 paddd %xmm0,%xmm1
593 pcmpeqd %xmm5,%xmm0
594 movdqa %xmm3,160(%r10)
595 movdqa %xmm4,%xmm3
596 paddd %xmm1,%xmm2
597 pcmpeqd %xmm5,%xmm1
598 movdqa %xmm0,176(%r10)
599 movdqa %xmm4,%xmm0
600
601 paddd %xmm2,%xmm3
602 pcmpeqd %xmm5,%xmm2
603 movdqa %xmm1,192(%r10)
604 movdqa %xmm4,%xmm1
605
606 paddd %xmm3,%xmm0
607 pcmpeqd %xmm5,%xmm3
608 movdqa %xmm2,208(%r10)
609 movdqa %xmm4,%xmm2
610
611 paddd %xmm0,%xmm1
612 pcmpeqd %xmm5,%xmm0
613 movdqa %xmm3,224(%r10)
614 movdqa %xmm4,%xmm3
615 paddd %xmm1,%xmm2
616 pcmpeqd %xmm5,%xmm1
617 movdqa %xmm0,240(%r10)
618 movdqa %xmm4,%xmm0
619
620 paddd %xmm2,%xmm3
621 pcmpeqd %xmm5,%xmm2
622 movdqa %xmm1,256(%r10)
623 movdqa %xmm4,%xmm1
624
625 paddd %xmm3,%xmm0
626 pcmpeqd %xmm5,%xmm3
627 movdqa %xmm2,272(%r10)
628 movdqa %xmm4,%xmm2
629
630 paddd %xmm0,%xmm1
631 pcmpeqd %xmm5,%xmm0
632 movdqa %xmm3,288(%r10)
633 movdqa %xmm4,%xmm3
634 paddd %xmm1,%xmm2
635 pcmpeqd %xmm5,%xmm1
636 movdqa %xmm0,304(%r10)
637
638 paddd %xmm2,%xmm3
639.byte 0x67
640 pcmpeqd %xmm5,%xmm2
641 movdqa %xmm1,320(%r10)
642
643 pcmpeqd %xmm5,%xmm3
644 movdqa %xmm2,336(%r10)
// First gather: every 16-byte slice of the 256-byte table row is ANDed
// with its mask and the results are ORed together, so the whole row is
// read whatever the selector is.
645 pand 64(%r12),%xmm0
646
647 pand 80(%r12),%xmm1
648 pand 96(%r12),%xmm2
649 movdqa %xmm3,352(%r10)
650 pand 112(%r12),%xmm3
651 por %xmm2,%xmm0
652 por %xmm3,%xmm1
653 movdqa -128(%r12),%xmm4
654 movdqa -112(%r12),%xmm5
655 movdqa -96(%r12),%xmm2
656 pand 112(%r10),%xmm4
657 movdqa -80(%r12),%xmm3
658 pand 128(%r10),%xmm5
659 por %xmm4,%xmm0
660 pand 144(%r10),%xmm2
661 por %xmm5,%xmm1
662 pand 160(%r10),%xmm3
663 por %xmm2,%xmm0
664 por %xmm3,%xmm1
665 movdqa -64(%r12),%xmm4
666 movdqa -48(%r12),%xmm5
667 movdqa -32(%r12),%xmm2
668 pand 176(%r10),%xmm4
669 movdqa -16(%r12),%xmm3
670 pand 192(%r10),%xmm5
671 por %xmm4,%xmm0
672 pand 208(%r10),%xmm2
673 por %xmm5,%xmm1
674 pand 224(%r10),%xmm3
675 por %xmm2,%xmm0
676 por %xmm3,%xmm1
677 movdqa 0(%r12),%xmm4
678 movdqa 16(%r12),%xmm5
679 movdqa 32(%r12),%xmm2
680 pand 240(%r10),%xmm4
681 movdqa 48(%r12),%xmm3
682 pand 256(%r10),%xmm5
683 por %xmm4,%xmm0
684 pand 272(%r10),%xmm2
685 por %xmm5,%xmm1
686 pand 288(%r10),%xmm3
687 por %xmm2,%xmm0
688 por %xmm3,%xmm1
689 por %xmm1,%xmm0
690
// Fold high qword onto low, advance %r12 to the next row, and move the
// gathered word to %rbx (66 48 0F 7E C3 = movq %xmm0,%rbx).
691 pshufd $0x4e,%xmm0,%xmm1
692 por %xmm1,%xmm0
693 leaq 256(%r12),%r12
694.byte 102,72,15,126,195
695
// Spill the table end pointer and the destination pointer; %r13 and %rdi
// are reused as accumulators below.
696 movq %r13,16+8(%rsp)
697 movq %rdi,56+8(%rsp)
698
// %rsi is moved to the end of its operand and %r9 negated, so words are
// addressed as (%rsi,%r9,1) / (%rsi,%r15,1) with negative byte offsets.
699 movq (%r8),%r8
700 movq (%rsi),%rax
701 leaq (%rsi,%r9,1),%rsi
702 negq %r9
703
// %rbp = low64(first word * %rbx) * %r8: the per-pass factor applied to
// the %rcx words. %r14 = start of the accumulator words in the frame.
704 movq %r8,%rbp
705 mulq %rbx
706 movq %rax,%r10
707 movq (%rcx),%rax
708
709 imulq %r10,%rbp
710 leaq 64+8(%rsp),%r14
711 movq %rdx,%r11
712
713 mulq %rbp
714 addq %rax,%r10
715 movq 8(%rsi,%r9,1),%rax
716 adcq $0,%rdx
717 movq %rdx,%rdi
718
719 mulq %rbx
720 addq %rax,%r11
721 movq 8(%rcx),%rax
722 adcq $0,%rdx
723 movq %rdx,%r10
724
725 mulq %rbp
726 addq %rax,%rdi
727 movq 16(%rsi,%r9,1),%rax
728 adcq $0,%rdx
729 addq %r11,%rdi
730 leaq 32(%r9),%r15
731 leaq 32(%rcx),%rcx
732 adcq $0,%rdx
733 movq %rdi,(%r14)
734 movq %rdx,%r13
735 jmp L$1st4x
736
// First pass, four words per iteration. %r15 is a negative byte offset
// that counts up by 32; the loop ends when it reaches zero. Products with
// %rbx and %rbp alternate, with %r10/%r11 and %r13/%rdi carrying between
// steps.
737.p2align 5
738L$1st4x:
739 mulq %rbx
740 addq %rax,%r10
741 movq -16(%rcx),%rax
742 leaq 32(%r14),%r14
743 adcq $0,%rdx
744 movq %rdx,%r11
745
746 mulq %rbp
747 addq %rax,%r13
748 movq -8(%rsi,%r15,1),%rax
749 adcq $0,%rdx
750 addq %r10,%r13
751 adcq $0,%rdx
752 movq %r13,-24(%r14)
753 movq %rdx,%rdi
754
755 mulq %rbx
756 addq %rax,%r11
757 movq -8(%rcx),%rax
758 adcq $0,%rdx
759 movq %rdx,%r10
760
761 mulq %rbp
762 addq %rax,%rdi
763 movq (%rsi,%r15,1),%rax
764 adcq $0,%rdx
765 addq %r11,%rdi
766 adcq $0,%rdx
767 movq %rdi,-16(%r14)
768 movq %rdx,%r13
769
770 mulq %rbx
771 addq %rax,%r10
772 movq 0(%rcx),%rax
773 adcq $0,%rdx
774 movq %rdx,%r11
775
776 mulq %rbp
777 addq %rax,%r13
778 movq 8(%rsi,%r15,1),%rax
779 adcq $0,%rdx
780 addq %r10,%r13
781 adcq $0,%rdx
782 movq %r13,-8(%r14)
783 movq %rdx,%rdi
784
785 mulq %rbx
786 addq %rax,%r11
787 movq 8(%rcx),%rax
788 adcq $0,%rdx
789 movq %rdx,%r10
790
791 mulq %rbp
792 addq %rax,%rdi
793 movq 16(%rsi,%r15,1),%rax
794 adcq $0,%rdx
795 addq %r11,%rdi
796 leaq 32(%rcx),%rcx
797 adcq $0,%rdx
798 movq %rdi,(%r14)
799 movq %rdx,%r13
800
801 addq $32,%r15
802 jnz L$1st4x
803
// Tail of the first pass: the last two words, then %rax is reloaded with
// the first multiplicand word for the next pass, %rcx is rewound to its
// start, and the top carry is left in %rdi.
804 mulq %rbx
805 addq %rax,%r10
806 movq -16(%rcx),%rax
807 leaq 32(%r14),%r14
808 adcq $0,%rdx
809 movq %rdx,%r11
810
811 mulq %rbp
812 addq %rax,%r13
813 movq -8(%rsi),%rax
814 adcq $0,%rdx
815 addq %r10,%r13
816 adcq $0,%rdx
817 movq %r13,-24(%r14)
818 movq %rdx,%rdi
819
820 mulq %rbx
821 addq %rax,%r11
822 movq -8(%rcx),%rax
823 adcq $0,%rdx
824 movq %rdx,%r10
825
826 mulq %rbp
827 addq %rax,%rdi
828 movq (%rsi,%r9,1),%rax
829 adcq $0,%rdx
830 addq %r11,%rdi
831 adcq $0,%rdx
832 movq %rdi,-16(%r14)
833 movq %rdx,%r13
834
835 leaq (%rcx,%r9,1),%rcx
836
837 xorq %rdi,%rdi
838 addq %r10,%r13
839 adcq $0,%rdi
840 movq %r13,-8(%r14)
841
842 jmp L$outer4x
843
// Remaining passes. %rdx is pointed at the masks relative to the current
// accumulator end in %r14 and the next table row is gathered into %rbx
// with the same AND/OR pattern as the first gather.
844.p2align 5
845L$outer4x:
846 leaq 16+128(%r14),%rdx
847 pxor %xmm4,%xmm4
848 pxor %xmm5,%xmm5
849 movdqa -128(%r12),%xmm0
850 movdqa -112(%r12),%xmm1
851 movdqa -96(%r12),%xmm2
852 movdqa -80(%r12),%xmm3
853 pand -128(%rdx),%xmm0
854 pand -112(%rdx),%xmm1
855 por %xmm0,%xmm4
856 pand -96(%rdx),%xmm2
857 por %xmm1,%xmm5
858 pand -80(%rdx),%xmm3
859 por %xmm2,%xmm4
860 por %xmm3,%xmm5
861 movdqa -64(%r12),%xmm0
862 movdqa -48(%r12),%xmm1
863 movdqa -32(%r12),%xmm2
864 movdqa -16(%r12),%xmm3
865 pand -64(%rdx),%xmm0
866 pand -48(%rdx),%xmm1
867 por %xmm0,%xmm4
868 pand -32(%rdx),%xmm2
869 por %xmm1,%xmm5
870 pand -16(%rdx),%xmm3
871 por %xmm2,%xmm4
872 por %xmm3,%xmm5
873 movdqa 0(%r12),%xmm0
874 movdqa 16(%r12),%xmm1
875 movdqa 32(%r12),%xmm2
876 movdqa 48(%r12),%xmm3
877 pand 0(%rdx),%xmm0
878 pand 16(%rdx),%xmm1
879 por %xmm0,%xmm4
880 pand 32(%rdx),%xmm2
881 por %xmm1,%xmm5
882 pand 48(%rdx),%xmm3
883 por %xmm2,%xmm4
884 por %xmm3,%xmm5
885 movdqa 64(%r12),%xmm0
886 movdqa 80(%r12),%xmm1
887 movdqa 96(%r12),%xmm2
888 movdqa 112(%r12),%xmm3
889 pand 64(%rdx),%xmm0
890 pand 80(%rdx),%xmm1
891 por %xmm0,%xmm4
892 pand 96(%rdx),%xmm2
893 por %xmm1,%xmm5
894 pand 112(%rdx),%xmm3
895 por %xmm2,%xmm4
896 por %xmm3,%xmm5
897 por %xmm5,%xmm4
898
// .byte sequence below encodes movq %xmm0,%rbx (new multiplier word).
899 pshufd $0x4e,%xmm4,%xmm0
900 por %xmm4,%xmm0
901 leaq 256(%r12),%r12
902.byte 102,72,15,126,195
903
// Start of a pass: load the first accumulator word, recompute the factor
// in %rbp, store the previous pass's top carry (%rdi) at (%r14), and rewind
// %r14 to the start of the accumulator.
904 movq (%r14,%r9,1),%r10
905 movq %r8,%rbp
906 mulq %rbx
907 addq %rax,%r10
908 movq (%rcx),%rax
909 adcq $0,%rdx
910
911 imulq %r10,%rbp
912 movq %rdx,%r11
913 movq %rdi,(%r14)
914
915 leaq (%r14,%r9,1),%r14
916
917 mulq %rbp
918 addq %rax,%r10
919 movq 8(%rsi,%r9,1),%rax
920 adcq $0,%rdx
921 movq %rdx,%rdi
922
923 mulq %rbx
924 addq %rax,%r11
925 movq 8(%rcx),%rax
926 adcq $0,%rdx
927 addq 8(%r14),%r11
928 adcq $0,%rdx
929 movq %rdx,%r10
930
931 mulq %rbp
932 addq %rax,%rdi
933 movq 16(%rsi,%r9,1),%rax
934 adcq $0,%rdx
935 addq %r11,%rdi
936 leaq 32(%r9),%r15
937 leaq 32(%rcx),%rcx
938 adcq $0,%rdx
939 movq %rdx,%r13
940 jmp L$inner4x
941
// Inner loop: same four-word pattern as L$1st4x, with the existing
// accumulator words added in and each result stored one word lower than
// the word that was read.
942.p2align 5
943L$inner4x:
944 mulq %rbx
945 addq %rax,%r10
946 movq -16(%rcx),%rax
947 adcq $0,%rdx
948 addq 16(%r14),%r10
949 leaq 32(%r14),%r14
950 adcq $0,%rdx
951 movq %rdx,%r11
952
953 mulq %rbp
954 addq %rax,%r13
955 movq -8(%rsi,%r15,1),%rax
956 adcq $0,%rdx
957 addq %r10,%r13
958 adcq $0,%rdx
959 movq %rdi,-32(%r14)
960 movq %rdx,%rdi
961
962 mulq %rbx
963 addq %rax,%r11
964 movq -8(%rcx),%rax
965 adcq $0,%rdx
966 addq -8(%r14),%r11
967 adcq $0,%rdx
968 movq %rdx,%r10
969
970 mulq %rbp
971 addq %rax,%rdi
972 movq (%rsi,%r15,1),%rax
973 adcq $0,%rdx
974 addq %r11,%rdi
975 adcq $0,%rdx
976 movq %r13,-24(%r14)
977 movq %rdx,%r13
978
979 mulq %rbx
980 addq %rax,%r10
981 movq 0(%rcx),%rax
982 adcq $0,%rdx
983 addq (%r14),%r10
984 adcq $0,%rdx
985 movq %rdx,%r11
986
987 mulq %rbp
988 addq %rax,%r13
989 movq 8(%rsi,%r15,1),%rax
990 adcq $0,%rdx
991 addq %r10,%r13
992 adcq $0,%rdx
993 movq %rdi,-16(%r14)
994 movq %rdx,%rdi
995
996 mulq %rbx
997 addq %rax,%r11
998 movq 8(%rcx),%rax
999 adcq $0,%rdx
1000 addq 8(%r14),%r11
1001 adcq $0,%rdx
1002 movq %rdx,%r10
1003
1004 mulq %rbp
1005 addq %rax,%rdi
1006 movq 16(%rsi,%r15,1),%rax
1007 adcq $0,%rdx
1008 addq %r11,%rdi
1009 leaq 32(%rcx),%rcx
1010 adcq $0,%rdx
1011 movq %r13,-8(%r14)
1012 movq %rdx,%r13
1013
1014 addq $32,%r15
1015 jnz L$inner4x
1016
1017 mulq %rbx
1018 addq %rax,%r10
1019 movq -16(%rcx),%rax
1020 adcq $0,%rdx
1021 addq 16(%r14),%r10
1022 leaq 32(%r14),%r14
1023 adcq $0,%rdx
1024 movq %rdx,%r11
1025
1026 mulq %rbp
1027 addq %rax,%r13
1028 movq -8(%rsi),%rax
1029 adcq $0,%rdx
1030 addq %r10,%r13
1031 adcq $0,%rdx
1032 movq %rdi,-32(%r14)
1033 movq %rdx,%rdi
1034
// For the last product of the pass the operands trade places: the factor
// moves to %rax and %rbp is loaded with -8(%rcx). The product is the same,
// and %rbp is left holding that word for the comparison after the loop.
1035 mulq %rbx
1036 addq %rax,%r11
1037 movq %rbp,%rax
1038 movq -8(%rcx),%rbp
1039 adcq $0,%rdx
1040 addq -8(%r14),%r11
1041 adcq $0,%rdx
1042 movq %rdx,%r10
1043
1044 mulq %rbp
1045 addq %rax,%rdi
1046 movq (%rsi,%r9,1),%rax
1047 adcq $0,%rdx
1048 addq %r11,%rdi
1049 adcq $0,%rdx
1050 movq %r13,-24(%r14)
1051 movq %rdx,%r13
1052
1053 movq %rdi,-16(%r14)
1054 leaq (%rcx,%r9,1),%rcx
1055
1056 xorq %rdi,%rdi
1057 addq %r10,%r13
1058 adcq $0,%rdi
1059 addq (%r14),%r13
1060 adcq $0,%rdi
1061 movq %r13,-8(%r14)
1062
// Loop until %r12 reaches the table end pointer spilled at 16+8(%rsp).
1063 cmpq 16+8(%rsp),%r12
1064 jb L$outer4x
// Hand-off to the shared subtraction code. %r15 is zero here (the inner
// loop exits when it reaches zero), so adcq captures the borrow of
// %rbp - %r13; that bit is ORed with the top carry in %rdi and negated
// into %rax. The remaining instructions load the registers that
// L$sqr4x_sub_entry consumes: %rbx = start of the accumulator,
// %rbp = start of the %rcx operand, %rcx = %r9 arithmetically shifted
// right by 5, %rdi = the saved destination pointer, and
// %r12/%r13/%r14/%r15 = the first four words at %rbp (with %r12
// decremented). NOTE(review): the contract of L$sqr4x_sub_entry is not
// visible from here.
1065 xorq %rax,%rax
1066 subq %r13,%rbp
1067 adcq %r15,%r15
1068 orq %r15,%rdi
1069 subq %rdi,%rax
1070 leaq (%r14,%r9,1),%rbx
1071 movq (%rcx),%r12
1072 leaq (%rcx),%rbp
1073 movq %r9,%rcx
1074 sarq $3+2,%rcx
1075 movq 56+8(%rsp),%rdi
1076 decq %r12
1077 xorq %r10,%r10
1078 movq 8(%rbp),%r13
1079 movq 16(%rbp),%r14
1080 movq 24(%rbp),%r15
1081 jmp L$sqr4x_sub_entry
1082
1083
// _bn_power5_nohw
//
// Runs the __bn_sqr8x_internal / __bn_post4x_internal pair five times and
// then calls mul4x_internal once (presumably five modular squarings
// followed by one multiplication by a gathered table entry, per the name -
// confirm). Returns 1 in %rax.
//
// Inputs as consumed below (System V AMD64 argument order):
//   %rdi, %rsi, %rdx, %rcx  passed on to the helpers; %rdi, %rcx and %rdx
//                           are parked in %xmm1, %xmm2 and %xmm4 across
//                           the squaring calls
//   %r8      pointer to a 64-bit constant; the value is copied to 32(%rsp)
//   %r9d     word count, converted to a byte count
//   8(entry %rsp)  selector read later by mul4x_internal through %rax
// %rbx, %rbp, %r12-%r15 and %rsp are saved on entry and restored on exit.
David Benjamin3efe2eb2024-05-08 22:24:27 -07001084.globl _bn_power5_nohw
1085.private_extern _bn_power5_nohw
David Benjaminfe0c91e2024-03-18 15:37:24 +10001086
1087.p2align 5
David Benjamin3efe2eb2024-05-08 22:24:27 -07001088_bn_power5_nohw:
David Benjaminfe0c91e2024-03-18 15:37:24 +10001089
1090_CET_ENDBR
// %rax keeps the entry stack pointer for the epilogue and for
// mul4x_internal.
1091 movq %rsp,%rax
1092
David Benjaminfe0c91e2024-03-18 15:37:24 +10001093 pushq %rbx
1094
1095 pushq %rbp
1096
1097 pushq %r12
1098
1099 pushq %r13
1100
1101 pushq %r14
1102
1103 pushq %r15
1104
1105L$power5_prologue:
1106
David Benjamin3efe2eb2024-05-08 22:24:27 -07001107
1108
1109
// %r9 = 8 * count, %r10 = 3 * that byte length (32-bit lea), %r9 negated
// for the address math; %r8 is replaced by the value it pointed to.
David Benjaminfe0c91e2024-03-18 15:37:24 +10001110 shll $3,%r9d
1111 leal (%r9,%r9,2),%r10d
1112 negq %r9
1113 movq (%r8),%r8
1114
1115
1116
1117
1118
1119
1120
1121
// Choose the frame bottom in %rbp. The candidate is %rsp - 320 - 2*bytes;
// %r11 is its distance from %rdi modulo 4096. Depending on how that
// distance compares with %r10 the frame is moved further down by an extra
// amount (presumably to keep the frame and the output buffer from aliasing
// modulo the page size - confirm).
1122 leaq -320(%rsp,%r9,2),%r11
1123 movq %rsp,%rbp
1124 subq %rdi,%r11
1125 andq $4095,%r11
1126 cmpq %r11,%r10
1127 jb L$pwr_sp_alt
1128 subq %r11,%rbp
1129 leaq -320(%rbp,%r9,2),%rbp
1130 jmp L$pwr_sp_done
1131
1132.p2align 5
1133L$pwr_sp_alt:
1134 leaq 4096-320(,%r9,2),%r10
1135 leaq -320(%rbp,%r9,2),%rbp
1136 subq %r10,%r11
1137 movq $0,%r10
1138 cmovcq %r10,%r11
1139 subq %r11,%rbp
1140L$pwr_sp_done:
// Align the frame to 64 bytes, then move %rsp down to it in steps of at
// most 4096 bytes, reading one word at each step (presumably so guard
// pages are touched in order - confirm).
1141 andq $-64,%rbp
1142 movq %rsp,%r11
1143 subq %rbp,%r11
1144 andq $-4096,%r11
1145 leaq (%r11,%rbp,1),%rsp
1146 movq (%rsp),%r10
1147 cmpq %rbp,%rsp
1148 ja L$pwr_page_walk
1149 jmp L$pwr_page_walk_done
1150
1151L$pwr_page_walk:
1152 leaq -4096(%rsp),%rsp
1153 movq (%rsp),%r10
1154 cmpq %rbp,%rsp
1155 ja L$pwr_page_walk
1156L$pwr_page_walk_done:
1157
// %r10 = negative byte length, %r9 = positive byte length.
1158 movq %r9,%r10
1159 negq %r9
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
// Frame slots: 32(%rsp) = the constant loaded from %r8, 40(%rsp) = entry
// %rsp.
1170 movq %r8,32(%rsp)
1171 movq %rax,40(%rsp)
1172
1173L$power5_body:
// Hand-encoded movq from general registers to XMM registers:
//   66 48 0F 6E CF  movq %rdi,%xmm1
//   66 48 0F 6E D1  movq %rcx,%xmm2
//   66 49 0F 6E DA  movq %r10,%xmm3
//   66 48 0F 6E E2  movq %rdx,%xmm4
1174.byte 102,72,15,110,207
1175.byte 102,72,15,110,209
1176.byte 102,73,15,110,218
1177.byte 102,72,15,110,226
1178
// Five rounds of the squaring helper followed by the post-processing
// helper. Their register contracts are defined outside this routine.
1179 call __bn_sqr8x_internal
1180 call __bn_post4x_internal
1181 call __bn_sqr8x_internal
1182 call __bn_post4x_internal
1183 call __bn_sqr8x_internal
1184 call __bn_post4x_internal
1185 call __bn_sqr8x_internal
1186 call __bn_post4x_internal
1187 call __bn_sqr8x_internal
1188 call __bn_post4x_internal
1189
// Set up mul4x_internal's inputs:
//   66 48 0F 7E D1  movq %xmm2,%rcx   (restore %rcx)
//   66 48 0F 7E E2  movq %xmm4,%rdx   (restore %rdx)
// %rdi is taken from %rsi as the helpers left it (NOTE(review): relies on
// the helpers' exit state - confirm), %rax = entry %rsp so the selector at
// 8(%rax) can be read, and %r8 = address of the constant saved at
// 32(%rsp), because mul4x_internal dereferences %r8.
1190.byte 102,72,15,126,209
1191.byte 102,72,15,126,226
1192 movq %rsi,%rdi
1193 movq 40(%rsp),%rax
1194 leaq 32(%rsp),%r8
1195
1196 call mul4x_internal
1197
// Epilogue: reload the entry %rsp, return 1, restore the six pushed
// registers from below it, and drop the whole frame.
1198 movq 40(%rsp),%rsi
1199
1200 movq $1,%rax
1201 movq -48(%rsi),%r15
1202
1203 movq -40(%rsi),%r14
1204
1205 movq -32(%rsi),%r13
1206
1207 movq -24(%rsi),%r12
1208
1209 movq -16(%rsi),%rbp
1210
1211 movq -8(%rsi),%rbx
1212
1213 leaq (%rsi),%rsp
1214
1215L$power5_epilogue:
1216 ret
1217
1218
1219
1220.globl _bn_sqr8x_internal
1221.private_extern _bn_sqr8x_internal
1222.private_extern _bn_sqr8x_internal
1223
1224.p2align 5
1225_bn_sqr8x_internal:
1226__bn_sqr8x_internal:
1227
1228_CET_ENDBR
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302 leaq 32(%r10),%rbp
1303 leaq (%rsi,%r9,1),%rsi
1304
1305 movq %r9,%rcx
1306
1307
1308 movq -32(%rsi,%rbp,1),%r14
1309 leaq 48+8(%rsp,%r9,2),%rdi
1310 movq -24(%rsi,%rbp,1),%rax
1311 leaq -32(%rdi,%rbp,1),%rdi
1312 movq -16(%rsi,%rbp,1),%rbx
1313 movq %rax,%r15
1314
1315 mulq %r14
1316 movq %rax,%r10
1317 movq %rbx,%rax
1318 movq %rdx,%r11
1319 movq %r10,-24(%rdi,%rbp,1)
1320
1321 mulq %r14
1322 addq %rax,%r11
1323 movq %rbx,%rax
1324 adcq $0,%rdx
1325 movq %r11,-16(%rdi,%rbp,1)
1326 movq %rdx,%r10
1327
1328
1329 movq -8(%rsi,%rbp,1),%rbx
1330 mulq %r15
1331 movq %rax,%r12
1332 movq %rbx,%rax
1333 movq %rdx,%r13
1334
1335 leaq (%rbp),%rcx
1336 mulq %r14
1337 addq %rax,%r10
1338 movq %rbx,%rax
1339 movq %rdx,%r11
1340 adcq $0,%r11
1341 addq %r12,%r10
1342 adcq $0,%r11
1343 movq %r10,-8(%rdi,%rcx,1)
1344 jmp L$sqr4x_1st
1345
1346.p2align 5
1347L$sqr4x_1st:
1348 movq (%rsi,%rcx,1),%rbx
1349 mulq %r15
1350 addq %rax,%r13
1351 movq %rbx,%rax
1352 movq %rdx,%r12
1353 adcq $0,%r12
1354
1355 mulq %r14
1356 addq %rax,%r11
1357 movq %rbx,%rax
1358 movq 8(%rsi,%rcx,1),%rbx
1359 movq %rdx,%r10
1360 adcq $0,%r10
1361 addq %r13,%r11
1362 adcq $0,%r10
1363
1364
1365 mulq %r15
1366 addq %rax,%r12
1367 movq %rbx,%rax
1368 movq %r11,(%rdi,%rcx,1)
1369 movq %rdx,%r13
1370 adcq $0,%r13
1371
1372 mulq %r14
1373 addq %rax,%r10
1374 movq %rbx,%rax
1375 movq 16(%rsi,%rcx,1),%rbx
1376 movq %rdx,%r11
1377 adcq $0,%r11
1378 addq %r12,%r10
1379 adcq $0,%r11
1380
1381 mulq %r15
1382 addq %rax,%r13
1383 movq %rbx,%rax
1384 movq %r10,8(%rdi,%rcx,1)
1385 movq %rdx,%r12
1386 adcq $0,%r12
1387
1388 mulq %r14
1389 addq %rax,%r11
1390 movq %rbx,%rax
1391 movq 24(%rsi,%rcx,1),%rbx
1392 movq %rdx,%r10
1393 adcq $0,%r10
1394 addq %r13,%r11
1395 adcq $0,%r10
1396
1397
1398 mulq %r15
1399 addq %rax,%r12
1400 movq %rbx,%rax
1401 movq %r11,16(%rdi,%rcx,1)
1402 movq %rdx,%r13
1403 adcq $0,%r13
1404 leaq 32(%rcx),%rcx
1405
1406 mulq %r14
1407 addq %rax,%r10
1408 movq %rbx,%rax
1409 movq %rdx,%r11
1410 adcq $0,%r11
1411 addq %r12,%r10
1412 adcq $0,%r11
1413 movq %r10,-8(%rdi,%rcx,1)
1414
1415 cmpq $0,%rcx
1416 jne L$sqr4x_1st
1417
1418 mulq %r15
1419 addq %rax,%r13
1420 leaq 16(%rbp),%rbp
1421 adcq $0,%rdx
1422 addq %r11,%r13
1423 adcq $0,%rdx
1424
1425 movq %r13,(%rdi)
1426 movq %rdx,%r12
1427 movq %rdx,8(%rdi)
1428 jmp L$sqr4x_outer
1429
1430.p2align 5
1431L$sqr4x_outer:
1432 movq -32(%rsi,%rbp,1),%r14
1433 leaq 48+8(%rsp,%r9,2),%rdi
1434 movq -24(%rsi,%rbp,1),%rax
1435 leaq -32(%rdi,%rbp,1),%rdi
1436 movq -16(%rsi,%rbp,1),%rbx
1437 movq %rax,%r15
1438
1439 mulq %r14
1440 movq -24(%rdi,%rbp,1),%r10
1441 addq %rax,%r10
1442 movq %rbx,%rax
1443 adcq $0,%rdx
1444 movq %r10,-24(%rdi,%rbp,1)
1445 movq %rdx,%r11
1446
1447 mulq %r14
1448 addq %rax,%r11
1449 movq %rbx,%rax
1450 adcq $0,%rdx
1451 addq -16(%rdi,%rbp,1),%r11
1452 movq %rdx,%r10
1453 adcq $0,%r10
1454 movq %r11,-16(%rdi,%rbp,1)
1455
1456 xorq %r12,%r12
1457
1458 movq -8(%rsi,%rbp,1),%rbx
1459 mulq %r15
1460 addq %rax,%r12
1461 movq %rbx,%rax
1462 adcq $0,%rdx
1463 addq -8(%rdi,%rbp,1),%r12
1464 movq %rdx,%r13
1465 adcq $0,%r13
1466
1467 mulq %r14
1468 addq %rax,%r10
1469 movq %rbx,%rax
1470 adcq $0,%rdx
1471 addq %r12,%r10
1472 movq %rdx,%r11
1473 adcq $0,%r11
1474 movq %r10,-8(%rdi,%rbp,1)
1475
1476 leaq (%rbp),%rcx
1477 jmp L$sqr4x_inner
1478
1479.p2align 5
1480L$sqr4x_inner:
1481 movq (%rsi,%rcx,1),%rbx
1482 mulq %r15
1483 addq %rax,%r13
1484 movq %rbx,%rax
1485 movq %rdx,%r12
1486 adcq $0,%r12
1487 addq (%rdi,%rcx,1),%r13
1488 adcq $0,%r12
1489
1490.byte 0x67
1491 mulq %r14
1492 addq %rax,%r11
1493 movq %rbx,%rax
1494 movq 8(%rsi,%rcx,1),%rbx
1495 movq %rdx,%r10
1496 adcq $0,%r10
1497 addq %r13,%r11
1498 adcq $0,%r10
1499
1500 mulq %r15
1501 addq %rax,%r12
1502 movq %r11,(%rdi,%rcx,1)
1503 movq %rbx,%rax
1504 movq %rdx,%r13
1505 adcq $0,%r13
1506 addq 8(%rdi,%rcx,1),%r12
1507 leaq 16(%rcx),%rcx
1508 adcq $0,%r13
1509
1510 mulq %r14
1511 addq %rax,%r10
1512 movq %rbx,%rax
1513 adcq $0,%rdx
1514 addq %r12,%r10
1515 movq %rdx,%r11
1516 adcq $0,%r11
1517 movq %r10,-8(%rdi,%rcx,1)
1518
1519 cmpq $0,%rcx
1520 jne L$sqr4x_inner
1521
1522.byte 0x67
1523 mulq %r15
1524 addq %rax,%r13
1525 adcq $0,%rdx
1526 addq %r11,%r13
1527 adcq $0,%rdx
1528
1529 movq %r13,(%rdi)
1530 movq %rdx,%r12
1531 movq %rdx,8(%rdi)
1532
1533 addq $16,%rbp
1534 jnz L$sqr4x_outer
1535
1536
1537 movq -32(%rsi),%r14
1538 leaq 48+8(%rsp,%r9,2),%rdi
1539 movq -24(%rsi),%rax
1540 leaq -32(%rdi,%rbp,1),%rdi
1541 movq -16(%rsi),%rbx
1542 movq %rax,%r15
1543
1544 mulq %r14
1545 addq %rax,%r10
1546 movq %rbx,%rax
1547 movq %rdx,%r11
1548 adcq $0,%r11
1549
1550 mulq %r14
1551 addq %rax,%r11
1552 movq %rbx,%rax
1553 movq %r10,-24(%rdi)
1554 movq %rdx,%r10
1555 adcq $0,%r10
1556 addq %r13,%r11
1557 movq -8(%rsi),%rbx
1558 adcq $0,%r10
1559
1560 mulq %r15
1561 addq %rax,%r12
1562 movq %rbx,%rax
1563 movq %r11,-16(%rdi)
1564 movq %rdx,%r13
1565 adcq $0,%r13
1566
1567 mulq %r14
1568 addq %rax,%r10
1569 movq %rbx,%rax
1570 movq %rdx,%r11
1571 adcq $0,%r11
1572 addq %r12,%r10
1573 adcq $0,%r11
1574 movq %r10,-8(%rdi)
1575
1576 mulq %r15
1577 addq %rax,%r13
1578 movq -16(%rsi),%rax
1579 adcq $0,%rdx
1580 addq %r11,%r13
1581 adcq $0,%rdx
1582
1583 movq %r13,(%rdi)
1584 movq %rdx,%r12
1585 movq %rdx,8(%rdi)
1586
1587 mulq %rbx
1588 addq $16,%rbp
1589 xorq %r14,%r14
1590 subq %r9,%rbp
1591 xorq %r15,%r15
1592
1593 addq %r12,%rax
1594 adcq $0,%rdx
1595 movq %rax,8(%rdi)
1596 movq %rdx,16(%rdi)
1597 movq %r15,24(%rdi)
1598
1599 movq -16(%rsi,%rbp,1),%rax
1600 leaq 48+8(%rsp),%rdi
1601 xorq %r10,%r10
1602 movq 8(%rdi),%r11
1603
1604 leaq (%r14,%r10,2),%r12
1605 shrq $63,%r10
1606 leaq (%rcx,%r11,2),%r13
1607 shrq $63,%r11
1608 orq %r10,%r13
1609 movq 16(%rdi),%r10
1610 movq %r11,%r14
1611 mulq %rax
1612 negq %r15
1613 movq 24(%rdi),%r11
1614 adcq %rax,%r12
1615 movq -8(%rsi,%rbp,1),%rax
1616 movq %r12,(%rdi)
1617 adcq %rdx,%r13
1618
1619 leaq (%r14,%r10,2),%rbx
1620 movq %r13,8(%rdi)
1621 sbbq %r15,%r15
1622 shrq $63,%r10
1623 leaq (%rcx,%r11,2),%r8
1624 shrq $63,%r11
1625 orq %r10,%r8
1626 movq 32(%rdi),%r10
1627 movq %r11,%r14
1628 mulq %rax
1629 negq %r15
1630 movq 40(%rdi),%r11
1631 adcq %rax,%rbx
1632 movq 0(%rsi,%rbp,1),%rax
1633 movq %rbx,16(%rdi)
1634 adcq %rdx,%r8
1635 leaq 16(%rbp),%rbp
1636 movq %r8,24(%rdi)
1637 sbbq %r15,%r15
1638 leaq 64(%rdi),%rdi
1639 jmp L$sqr4x_shift_n_add
1640
1641.p2align 5
1642L$sqr4x_shift_n_add:
1643 leaq (%r14,%r10,2),%r12
1644 shrq $63,%r10
1645 leaq (%rcx,%r11,2),%r13
1646 shrq $63,%r11
1647 orq %r10,%r13
1648 movq -16(%rdi),%r10
1649 movq %r11,%r14
1650 mulq %rax
1651 negq %r15
1652 movq -8(%rdi),%r11
1653 adcq %rax,%r12
1654 movq -8(%rsi,%rbp,1),%rax
1655 movq %r12,-32(%rdi)
1656 adcq %rdx,%r13
1657
1658 leaq (%r14,%r10,2),%rbx
1659 movq %r13,-24(%rdi)
1660 sbbq %r15,%r15
1661 shrq $63,%r10
1662 leaq (%rcx,%r11,2),%r8
1663 shrq $63,%r11
1664 orq %r10,%r8
1665 movq 0(%rdi),%r10
1666 movq %r11,%r14
1667 mulq %rax
1668 negq %r15
1669 movq 8(%rdi),%r11
1670 adcq %rax,%rbx
1671 movq 0(%rsi,%rbp,1),%rax
1672 movq %rbx,-16(%rdi)
1673 adcq %rdx,%r8
1674
1675 leaq (%r14,%r10,2),%r12
1676 movq %r8,-8(%rdi)
1677 sbbq %r15,%r15
1678 shrq $63,%r10
1679 leaq (%rcx,%r11,2),%r13
1680 shrq $63,%r11
1681 orq %r10,%r13
1682 movq 16(%rdi),%r10
1683 movq %r11,%r14
1684 mulq %rax
1685 negq %r15
1686 movq 24(%rdi),%r11
1687 adcq %rax,%r12
1688 movq 8(%rsi,%rbp,1),%rax
1689 movq %r12,0(%rdi)
1690 adcq %rdx,%r13
1691
1692 leaq (%r14,%r10,2),%rbx
1693 movq %r13,8(%rdi)
1694 sbbq %r15,%r15
1695 shrq $63,%r10
1696 leaq (%rcx,%r11,2),%r8
1697 shrq $63,%r11
1698 orq %r10,%r8
1699 movq 32(%rdi),%r10
1700 movq %r11,%r14
1701 mulq %rax
1702 negq %r15
1703 movq 40(%rdi),%r11
1704 adcq %rax,%rbx
1705 movq 16(%rsi,%rbp,1),%rax
1706 movq %rbx,16(%rdi)
1707 adcq %rdx,%r8
1708 movq %r8,24(%rdi)
1709 sbbq %r15,%r15
1710 leaq 64(%rdi),%rdi
1711 addq $32,%rbp
1712 jnz L$sqr4x_shift_n_add
1713
1714 leaq (%r14,%r10,2),%r12
1715.byte 0x67
1716 shrq $63,%r10
1717 leaq (%rcx,%r11,2),%r13
1718 shrq $63,%r11
1719 orq %r10,%r13
1720 movq -16(%rdi),%r10
1721 movq %r11,%r14
1722 mulq %rax
1723 negq %r15
1724 movq -8(%rdi),%r11
1725 adcq %rax,%r12
1726 movq -8(%rsi),%rax
1727 movq %r12,-32(%rdi)
1728 adcq %rdx,%r13
1729
1730 leaq (%r14,%r10,2),%rbx
1731 movq %r13,-24(%rdi)
1732 sbbq %r15,%r15
1733 shrq $63,%r10
1734 leaq (%rcx,%r11,2),%r8
1735 shrq $63,%r11
1736 orq %r10,%r8
1737 mulq %rax
1738 negq %r15
1739 adcq %rax,%rbx
1740 adcq %rdx,%r8
1741 movq %rbx,-16(%rdi)
1742 movq %r8,-8(%rdi)
1743.byte 102,72,15,126,213
// __bn_sqr8x_reduction -- internal helper, not a C-ABI function.
// NOTE(review): by its name this is the reduction pass that follows the 8x
// squaring code above (presumably word-by-word Montgomery reduction) --
// confirm against the generator script.
//
// Inputs read here:
//   %rbp         pointer to the 64-bit words that are multiplied in each pass
//   %r9          byte length; added to %rbp and to the buffer address below
//   32+8(%rsp)   64-bit factor used to derive each pass's multiplier
//   %xmm2/%xmm3  reloaded into %rbp/%r9 at the end of every outer iteration
// Stack slots written here:
//   0+8(%rsp)    %rbp+%r9, the end pointer %rbp is compared against
//   8+8(%rsp)    48+8(%rsp)+2*%r9, the end address of the buffer being reduced
//   48+8*k(%rsp) k=1..8: the eight multipliers of the current outer iteration
// Clobbers %rax,%rbx,%rcx,%rdx,%rsi,%rdi,%r8-%r15 and flags; on return %rax
// holds the final carry-out and %rdi == 8+8(%rsp) value.
1744__bn_sqr8x_reduction:
1745 xorq %rax,%rax
1746 leaq (%r9,%rbp,1),%rcx
1747 leaq 48+8(%rsp,%r9,2),%rdx
1748 movq %rcx,0+8(%rsp)
1749 leaq 48+8(%rsp,%r9,1),%rdi
1750 movq %rdx,8+8(%rsp)
1751 negq %r9
1752 jmp L$8x_reduction_loop
1753
1754.p2align 5
// Outer loop: one iteration per 8 words (64 bytes) of the buffer.
1755L$8x_reduction_loop:
1756 leaq (%rdi,%r9,1),%rdi
// Stray prefix byte on the next instruction (presumably alignment padding).
1757.byte 0x66
1758 movq 0(%rdi),%rbx
1759 movq 8(%rdi),%r9
1760 movq 16(%rdi),%r10
1761 movq 24(%rdi),%r11
1762 movq 32(%rdi),%r12
1763 movq 40(%rdi),%r13
1764 movq 48(%rdi),%r14
1765 movq 56(%rdi),%r15
// Park the carry from the previous outer iteration at the end of the buffer;
// it is picked up again by "addq (%rdx),%r8" in L$8x_tail_done.
1766 movq %rax,(%rdx)
1767 leaq 64(%rdi),%rdi
1768
1769.byte 0x67
// %r8 = lowest word, %rbx = first multiplier = lowest word * 32+8(%rsp).
1770 movq %rbx,%r8
1771 imulq 32+8(%rsp),%rbx
1772 movq 0(%rbp),%rax
1773 movl $8,%ecx
1774 jmp L$8x_reduce
1775
1776.p2align 5
// Eight passes (%ecx = 8..1). Each pass adds %rbx * (8 words at %rbp) into
// %r8..%r15, saves the multiplier it used at 48+8*%rcx(%rsp) and computes
// the next multiplier in %rsi from the new %r8.
1777L$8x_reduce:
1778 mulq %rbx
1779 movq 8(%rbp),%rax
// Only CF = (%r8 != 0) is taken from the lowest word; the low product word
// itself is not stored or added.
1780 negq %r8
1781 movq %rdx,%r8
1782 adcq $0,%r8
1783
1784 mulq %rbx
1785 addq %rax,%r9
1786 movq 16(%rbp),%rax
1787 adcq $0,%rdx
1788 addq %r9,%r8
1789 movq %rbx,48-8+8(%rsp,%rcx,8)
1790 movq %rdx,%r9
1791 adcq $0,%r9
1792
1793 mulq %rbx
1794 addq %rax,%r10
1795 movq 24(%rbp),%rax
1796 adcq $0,%rdx
1797 addq %r10,%r9
1798 movq 32+8(%rsp),%rsi
1799 movq %rdx,%r10
1800 adcq $0,%r10
1801
1802 mulq %rbx
1803 addq %rax,%r11
1804 movq 32(%rbp),%rax
1805 adcq $0,%rdx
// %rsi = 32+8(%rsp) * new %r8: the multiplier for the next pass.
1806 imulq %r8,%rsi
1807 addq %r11,%r10
1808 movq %rdx,%r11
1809 adcq $0,%r11
1810
1811 mulq %rbx
1812 addq %rax,%r12
1813 movq 40(%rbp),%rax
1814 adcq $0,%rdx
1815 addq %r12,%r11
1816 movq %rdx,%r12
1817 adcq $0,%r12
1818
1819 mulq %rbx
1820 addq %rax,%r13
1821 movq 48(%rbp),%rax
1822 adcq $0,%rdx
1823 addq %r13,%r12
1824 movq %rdx,%r13
1825 adcq $0,%r13
1826
1827 mulq %rbx
1828 addq %rax,%r14
1829 movq 56(%rbp),%rax
1830 adcq $0,%rdx
1831 addq %r14,%r13
1832 movq %rdx,%r14
1833 adcq $0,%r14
1834
1835 mulq %rbx
1836 movq %rsi,%rbx
1837 addq %rax,%r15
1838 movq 0(%rbp),%rax
1839 adcq $0,%rdx
1840 addq %r15,%r14
1841 movq %rdx,%r15
1842 adcq $0,%r15
1843
1844 decl %ecx
1845 jnz L$8x_reduce
1846
// Advance to the next 8 multiplicand words; if none are left, skip the tail.
1847 leaq 64(%rbp),%rbp
1848 xorq %rax,%rax
1849 movq 8+8(%rsp),%rdx
1850 cmpq 0+8(%rsp),%rbp
1851 jae L$8x_no_tail
1852
1853.byte 0x66
1854 addq 0(%rdi),%r8
1855 adcq 8(%rdi),%r9
1856 adcq 16(%rdi),%r10
1857 adcq 24(%rdi),%r11
1858 adcq 32(%rdi),%r12
1859 adcq 40(%rdi),%r13
1860 adcq 48(%rdi),%r14
1861 adcq 56(%rdi),%r15
// Keep the carry-out as a 0/-1 mask in %rsi.
1862 sbbq %rsi,%rsi
1863
// Reload the first saved multiplier (the slot written when %rcx was 8).
1864 movq 48+56+8(%rsp),%rbx
1865 movl $8,%ecx
1866 movq 0(%rbp),%rax
1867 jmp L$8x_tail
1868
1869.p2align 5
// Tail: replay the eight saved multipliers against the next 8 words at %rbp.
// Each pass stores one finished word at (%rdi) and advances %rdi by 8.
1870L$8x_tail:
1871 mulq %rbx
1872 addq %rax,%r8
1873 movq 8(%rbp),%rax
1874 movq %r8,(%rdi)
1875 movq %rdx,%r8
1876 adcq $0,%r8
1877
1878 mulq %rbx
1879 addq %rax,%r9
1880 movq 16(%rbp),%rax
1881 adcq $0,%rdx
1882 addq %r9,%r8
1883 leaq 8(%rdi),%rdi
1884 movq %rdx,%r9
1885 adcq $0,%r9
1886
1887 mulq %rbx
1888 addq %rax,%r10
1889 movq 24(%rbp),%rax
1890 adcq $0,%rdx
1891 addq %r10,%r9
1892 movq %rdx,%r10
1893 adcq $0,%r10
1894
1895 mulq %rbx
1896 addq %rax,%r11
1897 movq 32(%rbp),%rax
1898 adcq $0,%rdx
1899 addq %r11,%r10
1900 movq %rdx,%r11
1901 adcq $0,%r11
1902
1903 mulq %rbx
1904 addq %rax,%r12
1905 movq 40(%rbp),%rax
1906 adcq $0,%rdx
1907 addq %r12,%r11
1908 movq %rdx,%r12
1909 adcq $0,%r12
1910
1911 mulq %rbx
1912 addq %rax,%r13
1913 movq 48(%rbp),%rax
1914 adcq $0,%rdx
1915 addq %r13,%r12
1916 movq %rdx,%r13
1917 adcq $0,%r13
1918
1919 mulq %rbx
1920 addq %rax,%r14
1921 movq 56(%rbp),%rax
1922 adcq $0,%rdx
1923 addq %r14,%r13
1924 movq %rdx,%r14
1925 adcq $0,%r14
1926
1927 mulq %rbx
// Fetch the next saved multiplier (slot 40+8*%rcx(%rsp)).
1928 movq 48-16+8(%rsp,%rcx,8),%rbx
1929 addq %rax,%r15
1930 adcq $0,%rdx
1931 addq %r15,%r14
1932 movq 0(%rbp),%rax
1933 movq %rdx,%r15
1934 adcq $0,%r15
1935
1936 decl %ecx
1937 jnz L$8x_tail
1938
1939 leaq 64(%rbp),%rbp
1940 movq 8+8(%rsp),%rdx
1941 cmpq 0+8(%rsp),%rbp
1942 jae L$8x_tail_done
1943
1944 movq 48+56+8(%rsp),%rbx
// negq turns the 0/-1 mask in %rsi back into CF for the adc chain below.
1945 negq %rsi
1946 movq 0(%rbp),%rax
1947 adcq 0(%rdi),%r8
1948 adcq 8(%rdi),%r9
1949 adcq 16(%rdi),%r10
1950 adcq 24(%rdi),%r11
1951 adcq 32(%rdi),%r12
1952 adcq 40(%rdi),%r13
1953 adcq 48(%rdi),%r14
1954 adcq 56(%rdi),%r15
1955 sbbq %rsi,%rsi
1956
1957 movl $8,%ecx
1958 jmp L$8x_tail
1959
1960.p2align 5
1961L$8x_tail_done:
1962 xorq %rax,%rax
// Add the carry parked at (%rdx) at the top of the outer loop.
1963 addq (%rdx),%r8
1964 adcq $0,%r9
1965 adcq $0,%r10
1966 adcq $0,%r11
1967 adcq $0,%r12
1968 adcq $0,%r13
1969 adcq $0,%r14
1970 adcq $0,%r15
1971 adcq $0,%rax
1972
1973 negq %rsi
// When reached by the jae above, CF is clear here (the cmpq did not borrow).
1974L$8x_no_tail:
1975 adcq 0(%rdi),%r8
1976 adcq 8(%rdi),%r9
1977 adcq 16(%rdi),%r10
1978 adcq 24(%rdi),%r11
1979 adcq 32(%rdi),%r12
1980 adcq 40(%rdi),%r13
1981 adcq 48(%rdi),%r14
1982 adcq 56(%rdi),%r15
1983 adcq $0,%rax
1984 movq -8(%rbp),%rcx
1985 xorq %rsi,%rsi
1986
// Encoded "movq %xmm2,%rbp": rewind %rbp for the next outer iteration.
1987.byte 102,72,15,126,213
1988
1989 movq %r8,0(%rdi)
1990 movq %r9,8(%rdi)
// Encoded "movq %xmm3,%r9": reload the value %r9 had before it was reused.
1991.byte 102,73,15,126,217
1992 movq %r10,16(%rdi)
1993 movq %r11,24(%rdi)
1994 movq %r12,32(%rdi)
1995 movq %r13,40(%rdi)
1996 movq %r14,48(%rdi)
1997 movq %r15,56(%rdi)
1998 leaq 64(%rdi),%rdi
1999
// Repeat until %rdi reaches the end address kept in %rdx (8+8(%rsp)).
2000 cmpq %rdx,%rdi
2001 jb L$8x_reduction_loop
2002 ret
2003
2004
2005
2006.p2align 5
// __bn_post4x_internal -- internal helper, not a C-ABI function.
// Writes, four words per iteration, dst[i] = src[i] + (~m[i] & mask) + carry,
// with the very first m word decremented before it is complemented.
//   src  = %rdi + %r9 on entry (kept in %rbx)
//   m    = words at %rbp
//   mask = -%rax as computed by "negq %rax" below
//   dst  = value held in %xmm1
// When the mask is all-ones this adds the two's complement of m, i.e. it
// subtracts m; when it is zero, src is copied unchanged.
// NOTE(review): presumably the final conditional subtraction of the modulus,
// with %rax being 0 or 1 -- confirm at the call sites.
// %r9 must be a negative multiple of 32 here: %rcx = %r9 >> 5 is counted up
// to zero. On return %r10 = entry %r9 and %r9 is negated.
2007__bn_post4x_internal:
2008
2009 movq 0(%rbp),%r12
2010 leaq (%rdi,%r9,1),%rbx
2011 movq %r9,%rcx
// Encoded "movq %xmm1,%rdi".
2012.byte 102,72,15,126,207
2013 negq %rax
// Encoded "movq %xmm1,%rsi".
2014.byte 102,72,15,126,206
2015 sarq $3+2,%rcx
// ~(m[0]-1) == -m[0]: this supplies the "+1" of the two's complement, so the
// chain can start with no carry (%r10 = 0).
2016 decq %r12
2017 xorq %r10,%r10
2018 movq 8(%rbp),%r13
2019 movq 16(%rbp),%r14
2020 movq 24(%rbp),%r15
2021 jmp L$sqr4x_sub_entry
2022
2023.p2align 4
2024L$sqr4x_sub:
2025 movq 0(%rbp),%r12
2026 movq 8(%rbp),%r13
2027 movq 16(%rbp),%r14
2028 movq 24(%rbp),%r15
2029L$sqr4x_sub_entry:
2030 leaq 32(%rbp),%rbp
2031 notq %r12
2032 notq %r13
2033 notq %r14
2034 notq %r15
2035 andq %rax,%r12
2036 andq %rax,%r13
2037 andq %rax,%r14
2038 andq %rax,%r15
2039
// Restore CF from the 0/-1 mask saved in %r10 by the previous iteration.
2040 negq %r10
2041 adcq 0(%rbx),%r12
2042 adcq 8(%rbx),%r13
2043 adcq 16(%rbx),%r14
2044 adcq 24(%rbx),%r15
2045 movq %r12,0(%rdi)
2046 leaq 32(%rbx),%rbx
2047 movq %r13,8(%rdi)
// Save CF as a 0/-1 mask; the incq below would not clobber CF, but the negq
// at the top of the next iteration reads it back from %r10.
2048 sbbq %r10,%r10
2049 movq %r14,16(%rdi)
2050 movq %r15,24(%rdi)
2051 leaq 32(%rdi),%rdi
2052
2053 incq %rcx
2054 jnz L$sqr4x_sub
2055
2056 movq %r9,%r10
2057 negq %r9
2058 ret
2059
2060
// _bn_mulx4x_mont_gather5 -- exported entry point (private_extern).
// Sets up a stack frame, calls mulx4x_internal, restores the callee-saved
// registers and returns 1 in %rax.
// Register use visible here: %rdi is only used to position the frame, %r8 is
// dereferenced once and the loaded word stored at 32(%rsp), %r9d is a count
// that is multiplied by 8; %rsi, %rdx and %rcx are passed through untouched
// to mulx4x_internal, which also reads a 32-bit value at 8(entry %rsp).
// NOTE(review): assuming the SysV order (rdi, rsi, rdx, rcx, r8, r9, stack)
// these are presumably result, a, gather table, modulus, &n0, word count and
// table index -- confirm against the C prototype.
David Benjamin3efe2eb2024-05-08 22:24:27 -07002061.globl _bn_mulx4x_mont_gather5
2062.private_extern _bn_mulx4x_mont_gather5
David Benjaminfe0c91e2024-03-18 15:37:24 +10002063
2064.p2align 5
David Benjamin3efe2eb2024-05-08 22:24:27 -07002065_bn_mulx4x_mont_gather5:
David Benjaminfe0c91e2024-03-18 15:37:24 +10002066
David Benjamin3efe2eb2024-05-08 22:24:27 -07002067_CET_ENDBR
// %rax = %rsp at entry; kept until it is stored at 40(%rsp) below.
David Benjaminfe0c91e2024-03-18 15:37:24 +10002068 movq %rsp,%rax
2069
David Benjaminfe0c91e2024-03-18 15:37:24 +10002070 pushq %rbx
2071
2072 pushq %rbp
2073
2074 pushq %r12
2075
2076 pushq %r13
2077
2078 pushq %r14
2079
2080 pushq %r15
2081
2082L$mulx4x_prologue:
2083
David Benjamin3efe2eb2024-05-08 22:24:27 -07002084
2085
2086
// %r9 = 8*count (zero-extended), %r10 = 3*%r9, then %r9 is negated;
// %r8 = the 64-bit word %r8 pointed to.
David Benjaminfe0c91e2024-03-18 15:37:24 +10002087 shll $3,%r9d
2088 leaq (%r9,%r9,2),%r10
2089 negq %r9
2090 movq (%r8),%r8
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
// Pick the frame bottom (%rbp): 320 + 2*|%r9| bytes below %rsp, moved further
// down depending on ((candidate - %rdi) & 4095).
// NOTE(review): presumably this keeps the frame from aliasing the %rdi-based
// buffers modulo the 4096-byte page size -- confirm.
2101 leaq -320(%rsp,%r9,2),%r11
2102 movq %rsp,%rbp
2103 subq %rdi,%r11
2104 andq $4095,%r11
2105 cmpq %r11,%r10
2106 jb L$mulx4xsp_alt
2107 subq %r11,%rbp
2108 leaq -320(%rbp,%r9,2),%rbp
2109 jmp L$mulx4xsp_done
2110
2111L$mulx4xsp_alt:
2112 leaq 4096-320(,%r9,2),%r10
2113 leaq -320(%rbp,%r9,2),%rbp
2114 subq %r10,%r11
2115 movq $0,%r10
// Clamp to zero if the subtraction above borrowed.
2116 cmovcq %r10,%r11
2117 subq %r11,%rbp
2118L$mulx4xsp_done:
// Align the frame to 64 bytes, then lower %rsp to it while reading one word
// from every 4096-byte step on the way down (stack page walk).
2119 andq $-64,%rbp
2120 movq %rsp,%r11
2121 subq %rbp,%r11
2122 andq $-4096,%r11
2123 leaq (%r11,%rbp,1),%rsp
2124 movq (%rsp),%r10
2125 cmpq %rbp,%rsp
2126 ja L$mulx4x_page_walk
2127 jmp L$mulx4x_page_walk_done
2128
2129L$mulx4x_page_walk:
2130 leaq -4096(%rsp),%rsp
2131 movq (%rsp),%r10
2132 cmpq %rbp,%rsp
2133 ja L$mulx4x_page_walk
2134L$mulx4x_page_walk_done:
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
// Frame slots: 32(%rsp) = word loaded from (%r8), 40(%rsp) = entry %rsp.
2148 movq %r8,32(%rsp)
2149 movq %rax,40(%rsp)
2150
2151L$mulx4x_body:
2152 call mulx4x_internal
2153
// Epilogue: the six saved registers sit just below the entry %rsp.
2154 movq 40(%rsp),%rsi
2155
2156 movq $1,%rax
2157
2158 movq -48(%rsi),%r15
2159
2160 movq -40(%rsi),%r14
2161
2162 movq -32(%rsi),%r13
2163
2164 movq -24(%rsi),%r12
2165
2166 movq -16(%rsi),%rbp
2167
2168 movq -8(%rsi),%rbx
2169
2170 leaq (%rsi),%rsp
2171
2172L$mulx4x_epilogue:
2173 ret
2174
2175
2176
2177
2178.p2align 5
// mulx4x_internal -- internal helper, not a C-ABI function; called from
// _bn_mulx4x_mont_gather5 and _bn_powerx5 with the frame already built.
// NOTE(review): presumably the MULX/ADCX/ADOX Montgomery multiplication
// a * table[index] mod n, processed four words at a time -- confirm.
//
// Inputs read here:
//   %rax   caller's entry %rsp; the 32-bit index is read from 8(%rax)
//   %rdx   table base; 16-byte entries are read at 256-byte stride
//   %rsi   first multiplicand words (0..24(%rsi), advancing by 32)
//   %rcx   second multiplicand words used with the derived multiplier %r8
//   %r9    negative byte length (stored, negated and rescaled below)
//   %rdi   saved at 56+8(%rsp), reloaded into %rdx before the final jump
//   32+8(%rsp)  factor used to derive %r8 (imulq)
// Stack slots written: 8(%rsp) = entry %r9, 8+8(%rsp) = current table
// pointer, 16+8(%rsp) = table end pointer, 24+8(%rsp) = inner loop count,
// 56+8(%rsp) = entry %rdi, plus 16 select masks of 16 bytes each.
// Does not return by itself: it ends with a jump to L$sqrx4x_sub_entry.
2179mulx4x_internal:
2180
2181 movq %r9,8(%rsp)
2182 movq %r9,%r10
2183 negq %r9
2184 shlq $5,%r9
2185 negq %r10
// %r13 = table base + 128 + 32*|entry %r9|: where the table walk stops.
2186 leaq 128(%rdx,%r9,1),%r13
// %r9 = (|entry %r9| / 32) - 1: iteration count for the 4-word inner loops.
2187 shrq $5+5,%r9
2188 movd 8(%rax),%xmm5
2189 subq $1,%r9
2190 leaq L$inc(%rip),%rax
2191 movq %r13,16+8(%rsp)
2192 movq %r9,24+8(%rsp)
2193 movq %rdi,56+8(%rsp)
2194 movdqa 0(%rax),%xmm0
2195 movdqa 16(%rax),%xmm1
2196 leaq 88-112(%rsp,%r10,1),%r10
2197 leaq 128(%rdx),%rdi
2198
// Build 16 select masks at 112(%r10)..352(%r10): the index in %xmm5 is
// broadcast to all four lanes and compared (pcmpeqd) against counters that
// start from the L$inc constants and are stepped with paddd.
2199 pshufd $0,%xmm5,%xmm5
2200 movdqa %xmm1,%xmm4
2201.byte 0x67
2202 movdqa %xmm1,%xmm2
2203.byte 0x67
2204 paddd %xmm0,%xmm1
2205 pcmpeqd %xmm5,%xmm0
2206 movdqa %xmm4,%xmm3
2207 paddd %xmm1,%xmm2
2208 pcmpeqd %xmm5,%xmm1
2209 movdqa %xmm0,112(%r10)
2210 movdqa %xmm4,%xmm0
2211
2212 paddd %xmm2,%xmm3
2213 pcmpeqd %xmm5,%xmm2
2214 movdqa %xmm1,128(%r10)
2215 movdqa %xmm4,%xmm1
2216
2217 paddd %xmm3,%xmm0
2218 pcmpeqd %xmm5,%xmm3
2219 movdqa %xmm2,144(%r10)
2220 movdqa %xmm4,%xmm2
2221
2222 paddd %xmm0,%xmm1
2223 pcmpeqd %xmm5,%xmm0
2224 movdqa %xmm3,160(%r10)
2225 movdqa %xmm4,%xmm3
2226 paddd %xmm1,%xmm2
2227 pcmpeqd %xmm5,%xmm1
2228 movdqa %xmm0,176(%r10)
2229 movdqa %xmm4,%xmm0
2230
2231 paddd %xmm2,%xmm3
2232 pcmpeqd %xmm5,%xmm2
2233 movdqa %xmm1,192(%r10)
2234 movdqa %xmm4,%xmm1
2235
2236 paddd %xmm3,%xmm0
2237 pcmpeqd %xmm5,%xmm3
2238 movdqa %xmm2,208(%r10)
2239 movdqa %xmm4,%xmm2
2240
2241 paddd %xmm0,%xmm1
2242 pcmpeqd %xmm5,%xmm0
2243 movdqa %xmm3,224(%r10)
2244 movdqa %xmm4,%xmm3
2245 paddd %xmm1,%xmm2
2246 pcmpeqd %xmm5,%xmm1
2247 movdqa %xmm0,240(%r10)
2248 movdqa %xmm4,%xmm0
2249
2250 paddd %xmm2,%xmm3
2251 pcmpeqd %xmm5,%xmm2
2252 movdqa %xmm1,256(%r10)
2253 movdqa %xmm4,%xmm1
2254
2255 paddd %xmm3,%xmm0
2256 pcmpeqd %xmm5,%xmm3
2257 movdqa %xmm2,272(%r10)
2258 movdqa %xmm4,%xmm2
2259
2260 paddd %xmm0,%xmm1
2261 pcmpeqd %xmm5,%xmm0
2262 movdqa %xmm3,288(%r10)
2263 movdqa %xmm4,%xmm3
2264.byte 0x67
2265 paddd %xmm1,%xmm2
2266 pcmpeqd %xmm5,%xmm1
2267 movdqa %xmm0,304(%r10)
2268
2269 paddd %xmm2,%xmm3
2270 pcmpeqd %xmm5,%xmm2
2271 movdqa %xmm1,320(%r10)
2272
2273 pcmpeqd %xmm5,%xmm3
2274 movdqa %xmm2,336(%r10)
2275
// Gather the first word: every one of the 16 entries in this 256-byte table
// row is read and ANDed with its mask, then the results are ORed together,
// so the memory access pattern does not depend on the index.
// The last four masks are used straight from registers.
2276 pand 64(%rdi),%xmm0
2277 pand 80(%rdi),%xmm1
2278 pand 96(%rdi),%xmm2
2279 movdqa %xmm3,352(%r10)
2280 pand 112(%rdi),%xmm3
2281 por %xmm2,%xmm0
2282 por %xmm3,%xmm1
2283 movdqa -128(%rdi),%xmm4
2284 movdqa -112(%rdi),%xmm5
2285 movdqa -96(%rdi),%xmm2
2286 pand 112(%r10),%xmm4
2287 movdqa -80(%rdi),%xmm3
2288 pand 128(%r10),%xmm5
2289 por %xmm4,%xmm0
2290 pand 144(%r10),%xmm2
2291 por %xmm5,%xmm1
2292 pand 160(%r10),%xmm3
2293 por %xmm2,%xmm0
2294 por %xmm3,%xmm1
2295 movdqa -64(%rdi),%xmm4
2296 movdqa -48(%rdi),%xmm5
2297 movdqa -32(%rdi),%xmm2
2298 pand 176(%r10),%xmm4
2299 movdqa -16(%rdi),%xmm3
2300 pand 192(%r10),%xmm5
2301 por %xmm4,%xmm0
2302 pand 208(%r10),%xmm2
2303 por %xmm5,%xmm1
2304 pand 224(%r10),%xmm3
2305 por %xmm2,%xmm0
2306 por %xmm3,%xmm1
2307 movdqa 0(%rdi),%xmm4
2308 movdqa 16(%rdi),%xmm5
2309 movdqa 32(%rdi),%xmm2
2310 pand 240(%r10),%xmm4
2311 movdqa 48(%rdi),%xmm3
2312 pand 256(%r10),%xmm5
2313 por %xmm4,%xmm0
2314 pand 272(%r10),%xmm2
2315 por %xmm5,%xmm1
2316 pand 288(%r10),%xmm3
2317 por %xmm2,%xmm0
2318 por %xmm3,%xmm1
2319 pxor %xmm1,%xmm0
2320
// Fold the high 64 bits onto the low 64 bits and move the result to %rdx.
2321 pshufd $0x4e,%xmm0,%xmm1
2322 por %xmm1,%xmm0
2323 leaq 256(%rdi),%rdi
// Encoded "movq %xmm0,%rdx": the gathered word becomes the MULX multiplier.
2324.byte 102,72,15,126,194
2325 leaq 64+32+8(%rsp),%rbx
2326
// First row, first four words: products of %rdx with 0..24(%rsi).
2327 movq %rdx,%r9
2328 mulxq 0(%rsi),%r8,%rax
2329 mulxq 8(%rsi),%r11,%r12
2330 addq %rax,%r11
2331 mulxq 16(%rsi),%rax,%r13
2332 adcq %rax,%r12
2333 adcq $0,%r13
2334 mulxq 24(%rsi),%rax,%r14
2335
// %r8 = lowest product word * 32+8(%rsp): multiplier for the (%rcx) words.
2336 movq %r8,%r15
2337 imulq 32+8(%rsp),%r8
// xorq also clears CF and OF, starting fresh ADCX/ADOX carry chains.
2338 xorq %rbp,%rbp
2339 movq %r8,%rdx
2340
2341 movq %rdi,8+8(%rsp)
2342
2343 leaq 32(%rsi),%rsi
2344 adcxq %rax,%r13
2345 adcxq %rbp,%r14
2346
2347 mulxq 0(%rcx),%rax,%r10
2348 adcxq %rax,%r15
2349 adoxq %r11,%r10
2350 mulxq 8(%rcx),%rax,%r11
2351 adcxq %rax,%r10
2352 adoxq %r12,%r11
2353 mulxq 16(%rcx),%rax,%r12
2354 movq 24+8(%rsp),%rdi
2355 movq %r10,-32(%rbx)
2356 adcxq %rax,%r11
2357 adoxq %r13,%r12
2358 mulxq 24(%rcx),%rax,%r15
2359 movq %r9,%rdx
2360 movq %r11,-24(%rbx)
2361 adcxq %rax,%r12
2362 adoxq %rbp,%r15
2363 leaq 32(%rcx),%rcx
2364 movq %r12,-16(%rbx)
2365 jmp L$mulx4x_1st
2366
2367.p2align 5
// First row, remaining words: %rdi counts 4-word groups; %rdx alternates
// between the gathered word (%r9) and the derived multiplier (%r8).
2368L$mulx4x_1st:
2369 adcxq %rbp,%r15
2370 mulxq 0(%rsi),%r10,%rax
2371 adcxq %r14,%r10
2372 mulxq 8(%rsi),%r11,%r14
2373 adcxq %rax,%r11
2374 mulxq 16(%rsi),%r12,%rax
2375 adcxq %r14,%r12
2376 mulxq 24(%rsi),%r13,%r14
2377.byte 0x67,0x67
2378 movq %r8,%rdx
2379 adcxq %rax,%r13
2380 adcxq %rbp,%r14
2381 leaq 32(%rsi),%rsi
2382 leaq 32(%rbx),%rbx
2383
2384 adoxq %r15,%r10
2385 mulxq 0(%rcx),%rax,%r15
2386 adcxq %rax,%r10
2387 adoxq %r15,%r11
2388 mulxq 8(%rcx),%rax,%r15
2389 adcxq %rax,%r11
2390 adoxq %r15,%r12
2391 mulxq 16(%rcx),%rax,%r15
2392 movq %r10,-40(%rbx)
2393 adcxq %rax,%r12
2394 movq %r11,-32(%rbx)
2395 adoxq %r15,%r13
2396 mulxq 24(%rcx),%rax,%r15
2397 movq %r9,%rdx
2398 movq %r12,-24(%rbx)
2399 adcxq %rax,%r13
2400 adoxq %rbp,%r15
2401 leaq 32(%rcx),%rcx
2402 movq %r13,-16(%rbx)
2403
2404 decq %rdi
2405 jnz L$mulx4x_1st
2406
// %rax = entry %r9 (negative byte length); adding it rewinds %rsi.
2407 movq 8(%rsp),%rax
2408 adcq %rbp,%r15
2409 leaq (%rsi,%rax,1),%rsi
2410 addq %r15,%r14
2411 movq 8+8(%rsp),%rdi
// %rbp (zero before this) becomes the row's top carry, 0 or 1.
2412 adcq %rbp,%rbp
2413 movq %r14,-8(%rbx)
2414 jmp L$mulx4x_outer
2415
2416.p2align 5
// Outer loop: one iteration per remaining table row. The next word is
// gathered with the same 16 masks, addressed here relative to %rbx.
2417L$mulx4x_outer:
2418 leaq 16-256(%rbx),%r10
2419 pxor %xmm4,%xmm4
2420.byte 0x67,0x67
2421 pxor %xmm5,%xmm5
2422 movdqa -128(%rdi),%xmm0
2423 movdqa -112(%rdi),%xmm1
2424 movdqa -96(%rdi),%xmm2
2425 pand 256(%r10),%xmm0
2426 movdqa -80(%rdi),%xmm3
2427 pand 272(%r10),%xmm1
2428 por %xmm0,%xmm4
2429 pand 288(%r10),%xmm2
2430 por %xmm1,%xmm5
2431 pand 304(%r10),%xmm3
2432 por %xmm2,%xmm4
2433 por %xmm3,%xmm5
2434 movdqa -64(%rdi),%xmm0
2435 movdqa -48(%rdi),%xmm1
2436 movdqa -32(%rdi),%xmm2
2437 pand 320(%r10),%xmm0
2438 movdqa -16(%rdi),%xmm3
2439 pand 336(%r10),%xmm1
2440 por %xmm0,%xmm4
2441 pand 352(%r10),%xmm2
2442 por %xmm1,%xmm5
2443 pand 368(%r10),%xmm3
2444 por %xmm2,%xmm4
2445 por %xmm3,%xmm5
2446 movdqa 0(%rdi),%xmm0
2447 movdqa 16(%rdi),%xmm1
2448 movdqa 32(%rdi),%xmm2
2449 pand 384(%r10),%xmm0
2450 movdqa 48(%rdi),%xmm3
2451 pand 400(%r10),%xmm1
2452 por %xmm0,%xmm4
2453 pand 416(%r10),%xmm2
2454 por %xmm1,%xmm5
2455 pand 432(%r10),%xmm3
2456 por %xmm2,%xmm4
2457 por %xmm3,%xmm5
2458 movdqa 64(%rdi),%xmm0
2459 movdqa 80(%rdi),%xmm1
2460 movdqa 96(%rdi),%xmm2
2461 pand 448(%r10),%xmm0
2462 movdqa 112(%rdi),%xmm3
2463 pand 464(%r10),%xmm1
2464 por %xmm0,%xmm4
2465 pand 480(%r10),%xmm2
2466 por %xmm1,%xmm5
2467 pand 496(%r10),%xmm3
2468 por %xmm2,%xmm4
2469 por %xmm3,%xmm5
2470 por %xmm5,%xmm4
2471
2472 pshufd $0x4e,%xmm4,%xmm0
2473 por %xmm4,%xmm0
2474 leaq 256(%rdi),%rdi
// Encoded "movq %xmm0,%rdx".
2475.byte 102,72,15,126,194
2476
// Store the previous row's top carry, then rewind %rbx to the start of the
// accumulator (%rax holds the negative byte length).
2477 movq %rbp,(%rbx)
2478 leaq 32(%rbx,%rax,1),%rbx
2479 mulxq 0(%rsi),%r8,%r11
2480 xorq %rbp,%rbp
2481 movq %rdx,%r9
2482 mulxq 8(%rsi),%r14,%r12
2483 adoxq -32(%rbx),%r8
2484 adcxq %r14,%r11
2485 mulxq 16(%rsi),%r15,%r13
2486 adoxq -24(%rbx),%r11
2487 adcxq %r15,%r12
2488 mulxq 24(%rsi),%rdx,%r14
2489 adoxq -16(%rbx),%r12
2490 adcxq %rdx,%r13
2491 leaq (%rcx,%rax,1),%rcx
2492 leaq 32(%rsi),%rsi
2493 adoxq -8(%rbx),%r13
2494 adcxq %rbp,%r14
2495 adoxq %rbp,%r14
2496
2497 movq %r8,%r15
2498 imulq 32+8(%rsp),%r8
2499
2500 movq %r8,%rdx
2501 xorq %rbp,%rbp
2502 movq %rdi,8+8(%rsp)
2503
2504 mulxq 0(%rcx),%rax,%r10
2505 adcxq %rax,%r15
2506 adoxq %r11,%r10
2507 mulxq 8(%rcx),%rax,%r11
2508 adcxq %rax,%r10
2509 adoxq %r12,%r11
2510 mulxq 16(%rcx),%rax,%r12
2511 adcxq %rax,%r11
2512 adoxq %r13,%r12
2513 mulxq 24(%rcx),%rax,%r15
2514 movq %r9,%rdx
2515 movq 24+8(%rsp),%rdi
2516 movq %r10,-32(%rbx)
2517 adcxq %rax,%r12
2518 movq %r11,-24(%rbx)
2519 adoxq %rbp,%r15
2520 movq %r12,-16(%rbx)
2521 leaq 32(%rcx),%rcx
2522 jmp L$mulx4x_inner
2523
2524.p2align 5
// Inner loop: like L$mulx4x_1st, but also adds the accumulator words at
// 0..24(%rbx) from the previous row.
2525L$mulx4x_inner:
2526 mulxq 0(%rsi),%r10,%rax
2527 adcxq %rbp,%r15
2528 adoxq %r14,%r10
2529 mulxq 8(%rsi),%r11,%r14
2530 adcxq 0(%rbx),%r10
2531 adoxq %rax,%r11
2532 mulxq 16(%rsi),%r12,%rax
2533 adcxq 8(%rbx),%r11
2534 adoxq %r14,%r12
2535 mulxq 24(%rsi),%r13,%r14
2536 movq %r8,%rdx
2537 adcxq 16(%rbx),%r12
2538 adoxq %rax,%r13
2539 adcxq 24(%rbx),%r13
2540 adoxq %rbp,%r14
2541 leaq 32(%rsi),%rsi
2542 leaq 32(%rbx),%rbx
2543 adcxq %rbp,%r14
2544
2545 adoxq %r15,%r10
2546 mulxq 0(%rcx),%rax,%r15
2547 adcxq %rax,%r10
2548 adoxq %r15,%r11
2549 mulxq 8(%rcx),%rax,%r15
2550 adcxq %rax,%r11
2551 adoxq %r15,%r12
2552 mulxq 16(%rcx),%rax,%r15
2553 movq %r10,-40(%rbx)
2554 adcxq %rax,%r12
2555 adoxq %r15,%r13
2556 movq %r11,-32(%rbx)
2557 mulxq 24(%rcx),%rax,%r15
2558 movq %r9,%rdx
2559 leaq 32(%rcx),%rcx
2560 movq %r12,-24(%rbx)
2561 adcxq %rax,%r13
2562 adoxq %rbp,%r15
2563 movq %r13,-16(%rbx)
2564
2565 decq %rdi
2566 jnz L$mulx4x_inner
2567
2568 movq 0+8(%rsp),%rax
2569 adcq %rbp,%r15
// %rdi is zero here (the loop just ended), so 0 - (%rbx) sets CF exactly when
// the stored top carry is non-zero; %rdi itself is reloaded on the next line.
2570 subq 0(%rbx),%rdi
2571 movq 8+8(%rsp),%rdi
2572 movq 16+8(%rsp),%r10
2573 adcq %r15,%r14
2574 leaq (%rsi,%rax,1),%rsi
2575 adcq %rbp,%rbp
2576 movq %r14,-8(%rbx)
2577
// More table rows left? (%rdi = current table pointer, %r10 = end pointer)
2578 cmpq %r10,%rdi
2579 jb L$mulx4x_outer
2580
// Set up the registers that L$sqrx4x_sub_entry expects and jump there.
// %rax becomes 0 or -1: -(top carry | borrow of "-8(%rcx) - %r14").
// NOTE(review): presumably this decides whether the modulus is subtracted
// from the result -- confirm against __bn_postx4x_internal.
2581 movq -8(%rcx),%r10
2582 movq %rbp,%r8
2583 movq (%rcx,%rax,1),%r12
2584 leaq (%rcx,%rax,1),%rbp
2585 movq %rax,%rcx
2586 leaq (%rbx,%rax,1),%rdi
2587 xorl %eax,%eax
2588 xorq %r15,%r15
2589 subq %r14,%r10
2590 adcq %r15,%r15
2591 orq %r15,%r8
2592 sarq $3+2,%rcx
2593 subq %r8,%rax
2594 movq 56+8(%rsp),%rdx
2595 decq %r12
2596 movq 8(%rbp),%r13
2597 xorq %r8,%r8
2598 movq 16(%rbp),%r14
2599 movq 24(%rbp),%r15
2600 jmp L$sqrx4x_sub_entry
2601
2602
// _bn_powerx5 -- exported entry point (private_extern).
// Builds the same kind of stack frame as _bn_mulx4x_mont_gather5, runs
// __bn_sqrx8x_internal followed by __bn_postx4x_internal five times, then
// calls mulx4x_internal once, restores the callee-saved registers and
// returns 1 in %rax.
// NOTE(review): presumably this computes a^(2^5) * table[index] mod n in the
// Montgomery domain (five squarings and one multiplication) -- confirm
// against the C prototype and callers.
David Benjamin3efe2eb2024-05-08 22:24:27 -07002603.globl _bn_powerx5
2604.private_extern _bn_powerx5
David Benjaminfe0c91e2024-03-18 15:37:24 +10002605
2606.p2align 5
David Benjamin3efe2eb2024-05-08 22:24:27 -07002607_bn_powerx5:
David Benjaminfe0c91e2024-03-18 15:37:24 +10002608
David Benjamin3efe2eb2024-05-08 22:24:27 -07002609_CET_ENDBR
// %rax = %rsp at entry; stored at 40(%rsp) once the frame exists.
David Benjaminfe0c91e2024-03-18 15:37:24 +10002610 movq %rsp,%rax
2611
David Benjaminfe0c91e2024-03-18 15:37:24 +10002612 pushq %rbx
2613
2614 pushq %rbp
2615
2616 pushq %r12
2617
2618 pushq %r13
2619
2620 pushq %r14
2621
2622 pushq %r15
2623
2624L$powerx5_prologue:
2625
David Benjamin3efe2eb2024-05-08 22:24:27 -07002626
2627
2628
// %r9 = 8*count (zero-extended), %r10 = 3*%r9, then %r9 is negated;
// %r8 = the 64-bit word %r8 pointed to.
David Benjaminfe0c91e2024-03-18 15:37:24 +10002629 shll $3,%r9d
2630 leaq (%r9,%r9,2),%r10
2631 negq %r9
2632 movq (%r8),%r8
2633
2634
2635
2636
2637
2638
2639
2640
// Frame placement: 320 + 2*|%r9| bytes below %rsp, adjusted according to
// ((candidate - %rdi) & 4095).
// NOTE(review): presumably avoids aliasing the %rdi-based buffers modulo
// 4096 bytes -- confirm.
2641 leaq -320(%rsp,%r9,2),%r11
2642 movq %rsp,%rbp
2643 subq %rdi,%r11
2644 andq $4095,%r11
2645 cmpq %r11,%r10
2646 jb L$pwrx_sp_alt
2647 subq %r11,%rbp
2648 leaq -320(%rbp,%r9,2),%rbp
2649 jmp L$pwrx_sp_done
2650
2651.p2align 5
2652L$pwrx_sp_alt:
2653 leaq 4096-320(,%r9,2),%r10
2654 leaq -320(%rbp,%r9,2),%rbp
2655 subq %r10,%r11
2656 movq $0,%r10
// Clamp to zero if the subtraction above borrowed.
2657 cmovcq %r10,%r11
2658 subq %r11,%rbp
2659L$pwrx_sp_done:
// Align to 64 bytes and lower %rsp to the frame, reading one word from every
// 4096-byte step on the way down (stack page walk).
2660 andq $-64,%rbp
2661 movq %rsp,%r11
2662 subq %rbp,%r11
2663 andq $-4096,%r11
2664 leaq (%r11,%rbp,1),%rsp
2665 movq (%rsp),%r10
2666 cmpq %rbp,%rsp
2667 ja L$pwrx_page_walk
2668 jmp L$pwrx_page_walk_done
2669
2670L$pwrx_page_walk:
2671 leaq -4096(%rsp),%rsp
2672 movq (%rsp),%r10
2673 cmpq %rbp,%rsp
2674 ja L$pwrx_page_walk
2675L$pwrx_page_walk_done:
2676
// %r10 = negative byte length, %r9 = positive byte length.
2677 movq %r9,%r10
2678 negq %r9
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
// %xmm0 = 0 (used by __bn_sqrx8x_internal to clear its buffer). The four
// encoded instructions stash registers in xmm registers:
//   movq %rdi,%xmm1   movq %rcx,%xmm2   movq %r10,%xmm3   movq %rdx,%xmm4
2691 pxor %xmm0,%xmm0
2692.byte 102,72,15,110,207
2693.byte 102,72,15,110,209
2694.byte 102,73,15,110,218
2695.byte 102,72,15,110,226
// Frame slots: 32(%rsp) = word loaded from (%r8), 40(%rsp) = entry %rsp.
2696 movq %r8,32(%rsp)
2697 movq %rax,40(%rsp)
2698
2699L$powerx5_body:
2700
// Five rounds of the squaring routine followed by the post-processing routine.
2701 call __bn_sqrx8x_internal
2702 call __bn_postx4x_internal
2703 call __bn_sqrx8x_internal
2704 call __bn_postx4x_internal
2705 call __bn_sqrx8x_internal
2706 call __bn_postx4x_internal
2707 call __bn_sqrx8x_internal
2708 call __bn_postx4x_internal
2709 call __bn_sqrx8x_internal
2710 call __bn_postx4x_internal
2711
// Arguments for mulx4x_internal: %r9 from %r10, %rdi from %rsi, then the
// encoded "movq %xmm2,%rcx" and "movq %xmm4,%rdx" restore the stashed
// registers; %rax = entry %rsp so the helper can read the stack argument.
2712 movq %r10,%r9
2713 movq %rsi,%rdi
2714.byte 102,72,15,126,209
2715.byte 102,72,15,126,226
2716 movq 40(%rsp),%rax
2717
2718 call mulx4x_internal
2719
// Epilogue: the six saved registers sit just below the entry %rsp.
2720 movq 40(%rsp),%rsi
2721
2722 movq $1,%rax
2723
2724 movq -48(%rsi),%r15
2725
2726 movq -40(%rsi),%r14
2727
2728 movq -32(%rsi),%r13
2729
2730 movq -24(%rsi),%r12
2731
2732 movq -16(%rsi),%rbp
2733
2734 movq -8(%rsi),%rbx
2735
2736 leaq (%rsi),%rsp
2737
2738L$powerx5_epilogue:
2739 ret
2740
2741
2742
// _bn_sqrx8x_internal / __bn_sqrx8x_internal -- internal helper, not a
// C-ABI function (two labels for the same address).
// NOTE(review): presumably the MULX/ADCX/ADOX squaring of the number at %rsi
// into the stack buffer at 48+8(%rsp), followed by reduction -- confirm.
//
// Inputs read here:
//   %rsi   source words
//   %r9    positive byte length (a multiple of 64: the zeroing loop steps
//          by 64 and only stops at exactly zero)
//   %xmm0  must be zero; it is stored to clear the buffer
//   %xmm3  reloaded into %rcx in L$sqrx8x_outer_break
//   %xmm2  reloaded into %rbp just before the fall-through
// Stack slots written: 0+8(%rsp) = %r9, 8+8(%rsp) = %rsi+%r9 (end of source),
// 16+8(%rsp) = saved carry mask, 24+8(%rsp) = saved buffer pointer,
// 48+8(%rsp)... = result buffer.
// There is no ret in this block: it falls through into
// __bn_sqrx8x_reduction.
2743.globl _bn_sqrx8x_internal
2744.private_extern _bn_sqrx8x_internal
2745.private_extern _bn_sqrx8x_internal
2746
2747.p2align 5
2748_bn_sqrx8x_internal:
2749__bn_sqrx8x_internal:
2750
2751_CET_ENDBR
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792 leaq 48+8(%rsp),%rdi
2793 leaq (%rsi,%r9,1),%rbp
2794 movq %r9,0+8(%rsp)
2795 movq %rbp,8+8(%rsp)
2796 jmp L$sqr8x_zero_start
2797
2798.p2align 5
// 12-byte multi-byte NOP used as padding before the loop label.
2799.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
// Clear the buffer with %xmm0: 128 bytes per 64 bytes of %r9, entered in the
// middle so the first pass clears only 64 bytes.
2800L$sqrx8x_zero:
2801.byte 0x3e
2802 movdqa %xmm0,0(%rdi)
2803 movdqa %xmm0,16(%rdi)
2804 movdqa %xmm0,32(%rdi)
2805 movdqa %xmm0,48(%rdi)
2806L$sqr8x_zero_start:
2807 movdqa %xmm0,64(%rdi)
2808 movdqa %xmm0,80(%rdi)
2809 movdqa %xmm0,96(%rdi)
2810 movdqa %xmm0,112(%rdi)
2811 leaq 128(%rdi),%rdi
2812 subq $64,%r9
2813 jnz L$sqrx8x_zero
2814
2815 movq 0(%rsi),%rdx
2816
2817 xorq %r10,%r10
2818 xorq %r11,%r11
2819 xorq %r12,%r12
2820 xorq %r13,%r13
2821 xorq %r14,%r14
2822 xorq %r15,%r15
2823 leaq 48+8(%rsp),%rdi
// xorq also clears CF and OF before the first ADCX/ADOX chain.
2824 xorq %rbp,%rbp
2825 jmp L$sqrx8x_outer_loop
2826
2827.p2align 5
// Outer loop: for an 8-word window at %rsi, accumulate the products of each
// word with the words after it (%rdx steps through 0(%rsi), 8(%rsi), ...).
// On first entry %r9 is zero (left so by the zeroing loop above).
2828L$sqrx8x_outer_loop:
2829 mulxq 8(%rsi),%r8,%rax
2830 adcxq %r9,%r8
2831 adoxq %rax,%r10
2832 mulxq 16(%rsi),%r9,%rax
2833 adcxq %r10,%r9
2834 adoxq %rax,%r11
// Encoded "mulxq 24(%rsi),%r10,%rax".
2835.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
2836 adcxq %r11,%r10
2837 adoxq %rax,%r12
// Encoded "mulxq 32(%rsi),%r11,%rax".
2838.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
2839 adcxq %r12,%r11
2840 adoxq %rax,%r13
2841 mulxq 40(%rsi),%r12,%rax
2842 adcxq %r13,%r12
2843 adoxq %rax,%r14
2844 mulxq 48(%rsi),%r13,%rax
2845 adcxq %r14,%r13
2846 adoxq %r15,%rax
2847 mulxq 56(%rsi),%r14,%r15
2848 movq 8(%rsi),%rdx
2849 adcxq %rax,%r14
2850 adoxq %rbp,%r15
2851 adcq 64(%rdi),%r15
2852 movq %r8,8(%rdi)
2853 movq %r9,16(%rdi)
// Save the carry as a 0/-1 mask in %rcx; xorq then resets CF/OF.
2854 sbbq %rcx,%rcx
2855 xorq %rbp,%rbp
2856
2857
2858 mulxq 16(%rsi),%r8,%rbx
2859 mulxq 24(%rsi),%r9,%rax
2860 adcxq %r10,%r8
2861 adoxq %rbx,%r9
2862 mulxq 32(%rsi),%r10,%rbx
2863 adcxq %r11,%r9
2864 adoxq %rax,%r10
// Encoded "mulxq 40(%rsi),%r11,%rax".
2865.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
2866 adcxq %r12,%r10
2867 adoxq %rbx,%r11
// Encoded "mulxq 48(%rsi),%r12,%rbx".
2868.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
2869 adcxq %r13,%r11
2870 adoxq %r14,%r12
// Encoded "mulxq 56(%rsi),%r13,%r14".
2871.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
2872 movq 16(%rsi),%rdx
2873 adcxq %rax,%r12
2874 adoxq %rbx,%r13
2875 adcxq %r15,%r13
2876 adoxq %rbp,%r14
2877 adcxq %rbp,%r14
2878
2879 movq %r8,24(%rdi)
2880 movq %r9,32(%rdi)
2881
2882 mulxq 24(%rsi),%r8,%rbx
2883 mulxq 32(%rsi),%r9,%rax
2884 adcxq %r10,%r8
2885 adoxq %rbx,%r9
2886 mulxq 40(%rsi),%r10,%rbx
2887 adcxq %r11,%r9
2888 adoxq %rax,%r10
// Encoded "mulxq 48(%rsi),%r11,%rax".
2889.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
2890 adcxq %r12,%r10
2891 adoxq %r13,%r11
// Encoded "mulxq 56(%rsi),%r12,%r13".
2892.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
2893.byte 0x3e
2894 movq 24(%rsi),%rdx
2895 adcxq %rbx,%r11
2896 adoxq %rax,%r12
2897 adcxq %r14,%r12
2898 movq %r8,40(%rdi)
2899 movq %r9,48(%rdi)
2900 mulxq 32(%rsi),%r8,%rax
2901 adoxq %rbp,%r13
2902 adcxq %rbp,%r13
2903
2904 mulxq 40(%rsi),%r9,%rbx
2905 adcxq %r10,%r8
2906 adoxq %rax,%r9
2907 mulxq 48(%rsi),%r10,%rax
2908 adcxq %r11,%r9
2909 adoxq %r12,%r10
2910 mulxq 56(%rsi),%r11,%r12
2911 movq 32(%rsi),%rdx
2912 movq 40(%rsi),%r14
2913 adcxq %rbx,%r10
2914 adoxq %rax,%r11
2915 movq 48(%rsi),%r15
2916 adcxq %r13,%r11
2917 adoxq %rbp,%r12
2918 adcxq %rbp,%r12
2919
2920 movq %r8,56(%rdi)
2921 movq %r9,64(%rdi)
2922
// The last three source words of the window are kept in %r14, %r15 and %r8.
2923 mulxq %r14,%r9,%rax
2924 movq 56(%rsi),%r8
2925 adcxq %r10,%r9
2926 mulxq %r15,%r10,%rbx
2927 adoxq %rax,%r10
2928 adcxq %r11,%r10
2929 mulxq %r8,%r11,%rax
2930 movq %r14,%rdx
2931 adoxq %rbx,%r11
2932 adcxq %r12,%r11
2933
2934 adcxq %rbp,%rax
2935
2936 mulxq %r15,%r14,%rbx
2937 mulxq %r8,%r12,%r13
2938 movq %r15,%rdx
2939 leaq 64(%rsi),%rsi
2940 adcxq %r14,%r11
2941 adoxq %rbx,%r12
2942 adcxq %rax,%r12
2943 adoxq %rbp,%r13
2944
2945.byte 0x67,0x67
2946 mulxq %r8,%r8,%r14
2947 adcxq %r8,%r13
2948 adcxq %rbp,%r14
2949
// Was that the last 8-word window of the source?
2950 cmpq 8+8(%rsp),%rsi
2951 je L$sqrx8x_outer_break
2952
// negq turns the 0/-1 mask in %rcx back into CF; the movq that follows
// reuses %rcx as the loop counter without touching the flags.
2953 negq %rcx
2954 movq $-8,%rcx
2955 movq %rbp,%r15
2956 movq 64(%rdi),%r8
2957 adcxq 72(%rdi),%r9
2958 adcxq 80(%rdi),%r10
2959 adcxq 88(%rdi),%r11
2960 adcq 96(%rdi),%r12
2961 adcq 104(%rdi),%r13
2962 adcq 112(%rdi),%r14
2963 adcq 120(%rdi),%r15
2964 leaq (%rsi),%rbp
2965 leaq 128(%rdi),%rdi
2966 sbbq %rax,%rax
2967
2968 movq -64(%rsi),%rdx
2969 movq %rax,16+8(%rsp)
2970 movq %rdi,24+8(%rsp)
2971
2972
// xorl clears CF and OF for the chains in the loop below.
2973 xorl %eax,%eax
2974 jmp L$sqrx8x_loop
2975
2976.p2align 5
// Inner loop: eight passes (%rcx = -8..-1). Each pass multiplies one word of
// the current window (%rdx) by the 8 words at %rbp, adds into %r8..%r15 and
// stores one finished word at (%rdi,%rcx,8).
2977L$sqrx8x_loop:
2978 movq %r8,%rbx
2979 mulxq 0(%rbp),%rax,%r8
2980 adcxq %rax,%rbx
2981 adoxq %r9,%r8
2982
2983 mulxq 8(%rbp),%rax,%r9
2984 adcxq %rax,%r8
2985 adoxq %r10,%r9
2986
2987 mulxq 16(%rbp),%rax,%r10
2988 adcxq %rax,%r9
2989 adoxq %r11,%r10
2990
2991 mulxq 24(%rbp),%rax,%r11
2992 adcxq %rax,%r10
2993 adoxq %r12,%r11
2994
// Encoded "mulxq 32(%rbp),%rax,%r12".
2995.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
2996 adcxq %rax,%r11
2997 adoxq %r13,%r12
2998
2999 mulxq 40(%rbp),%rax,%r13
3000 adcxq %rax,%r12
3001 adoxq %r14,%r13
3002
3003 mulxq 48(%rbp),%rax,%r14
3004 movq %rbx,(%rdi,%rcx,8)
// movl (not xorl) is used to zero %rbx because the flags are live here.
3005 movl $0,%ebx
3006 adcxq %rax,%r13
3007 adoxq %r15,%r14
3008
// Encoded "mulxq 56(%rbp),%rax,%r15".
3009.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
3010 movq 8(%rsi,%rcx,8),%rdx
3011 adcxq %rax,%r14
3012 adoxq %rbx,%r15
3013 adcxq %rbx,%r15
3014
3015.byte 0x67
3016 incq %rcx
3017 jnz L$sqrx8x_loop
3018
3019 leaq 64(%rbp),%rbp
3020 movq $-8,%rcx
3021 cmpq 8+8(%rsp),%rbp
3022 je L$sqrx8x_break
3023
// %rbx is zero, so 0 - mask sets CF exactly when the saved mask is non-zero.
3024 subq 16+8(%rsp),%rbx
3025.byte 0x66
3026 movq -64(%rsi),%rdx
3027 adcxq 0(%rdi),%r8
3028 adcxq 8(%rdi),%r9
3029 adcq 16(%rdi),%r10
3030 adcq 24(%rdi),%r11
3031 adcq 32(%rdi),%r12
3032 adcq 40(%rdi),%r13
3033 adcq 48(%rdi),%r14
3034 adcq 56(%rdi),%r15
3035 leaq 64(%rdi),%rdi
3036.byte 0x67
3037 sbbq %rax,%rax
3038 xorl %ebx,%ebx
3039 movq %rax,16+8(%rsp)
3040 jmp L$sqrx8x_loop
3041
3042.p2align 5
// End of a row: fold in the saved carry, then either continue in place or
// write back %r9..%r15 and reload them from the pointer saved at 24+8(%rsp).
3043L$sqrx8x_break:
3044 xorq %rbp,%rbp
3045 subq 16+8(%rsp),%rbx
3046 adcxq %rbp,%r8
3047 movq 24+8(%rsp),%rcx
3048 adcxq %rbp,%r9
3049 movq 0(%rsi),%rdx
3050 adcq $0,%r10
3051 movq %r8,0(%rdi)
3052 adcq $0,%r11
3053 adcq $0,%r12
3054 adcq $0,%r13
3055 adcq $0,%r14
3056 adcq $0,%r15
3057 cmpq %rcx,%rdi
3058 je L$sqrx8x_outer_loop
3059
3060 movq %r9,8(%rdi)
3061 movq 8(%rcx),%r9
3062 movq %r10,16(%rdi)
3063 movq 16(%rcx),%r10
3064 movq %r11,24(%rdi)
3065 movq 24(%rcx),%r11
3066 movq %r12,32(%rdi)
3067 movq 32(%rcx),%r12
3068 movq %r13,40(%rdi)
3069 movq 40(%rcx),%r13
3070 movq %r14,48(%rdi)
3071 movq 48(%rcx),%r14
3072 movq %r15,56(%rdi)
3073 movq 56(%rcx),%r15
3074 movq %rcx,%rdi
3075 jmp L$sqrx8x_outer_loop
3076
3077.p2align 5
// All windows done: store the last partial sums and restart at the bottom of
// the buffer for the doubling pass.
3078L$sqrx8x_outer_break:
3079 movq %r9,72(%rdi)
// Encoded "movq %xmm3,%rcx"; together with %rsi (now at the end of the
// source) it indexes the source words below.
3080.byte 102,72,15,126,217
3081 movq %r10,80(%rdi)
3082 movq %r11,88(%rdi)
3083 movq %r12,96(%rdi)
3084 movq %r13,104(%rdi)
3085 movq %r14,112(%rdi)
3086 leaq 48+8(%rsp),%rdi
3087 movq (%rsi,%rcx,1),%rdx
3088
3089 movq 8(%rdi),%r11
// xorq clears CF and OF; %r10 starts as zero.
3090 xorq %r10,%r10
3091 movq 0+8(%rsp),%r9
3092 adoxq %r11,%r11
3093 movq 16(%rdi),%r12
3094 movq 24(%rdi),%r13
3095
3096
3097.p2align 5
// Doubling pass, 4 source words per iteration: each buffer word is doubled
// on the OF chain (adoxq reg,reg) and the square of a source word
// (mulxq %rdx,...) is added on the CF chain, then the pair is stored back.
3098L$sqrx4x_shift_n_add:
3099 mulxq %rdx,%rax,%rbx
3100 adoxq %r12,%r12
3101 adcxq %r10,%rax
// Encoded "movq 8(%rsi,%rcx,1),%rdx" and "movq 32(%rdi),%r10" (long forms).
3102.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
3103.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
3104 adoxq %r13,%r13
3105 adcxq %r11,%rbx
3106 movq 40(%rdi),%r11
3107 movq %rax,0(%rdi)
3108 movq %rbx,8(%rdi)
3109
3110 mulxq %rdx,%rax,%rbx
3111 adoxq %r10,%r10
3112 adcxq %r12,%rax
3113 movq 16(%rsi,%rcx,1),%rdx
3114 movq 48(%rdi),%r12
3115 adoxq %r11,%r11
3116 adcxq %r13,%rbx
3117 movq 56(%rdi),%r13
3118 movq %rax,16(%rdi)
3119 movq %rbx,24(%rdi)
3120
3121 mulxq %rdx,%rax,%rbx
3122 adoxq %r12,%r12
3123 adcxq %r10,%rax
3124 movq 24(%rsi,%rcx,1),%rdx
3125 leaq 32(%rcx),%rcx
3126 movq 64(%rdi),%r10
3127 adoxq %r13,%r13
3128 adcxq %r11,%rbx
3129 movq 72(%rdi),%r11
3130 movq %rax,32(%rdi)
3131 movq %rbx,40(%rdi)
3132
3133 mulxq %rdx,%rax,%rbx
3134 adoxq %r10,%r10
3135 adcxq %r12,%rax
// jrcxz tests %rcx without touching the flags, so both carry chains survive.
3136 jrcxz L$sqrx4x_shift_n_add_break
// Encoded "movq 0(%rsi,%rcx,1),%rdx" (long form).
3137.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
3138 adoxq %r11,%r11
3139 adcxq %r13,%rbx
3140 movq 80(%rdi),%r12
3141 movq 88(%rdi),%r13
3142 movq %rax,48(%rdi)
3143 movq %rbx,56(%rdi)
3144 leaq 64(%rdi),%rdi
3145 nop
3146 jmp L$sqrx4x_shift_n_add
3147
3148.p2align 5
3149L$sqrx4x_shift_n_add_break:
3150 adcxq %r13,%rbx
3151 movq %rax,48(%rdi)
3152 movq %rbx,56(%rdi)
3153 leaq 64(%rdi),%rdi
// Encoded "movq %xmm2,%rbp"; execution continues into __bn_sqrx8x_reduction.
3154.byte 102,72,15,126,213
// __bn_sqrx8x_reduction -- internal helper, not a C-ABI function; reached by
// fall-through from __bn_sqrx8x_internal.
// NOTE(review): by its name this is the reduction pass (presumably
// word-by-word Montgomery reduction) of the MULX squaring path -- confirm.
//
// Inputs read here:
//   %rbp         pointer to the words multiplied in each pass
//   %r9          byte length (used to compute the %rbp end pointer)
//   %rdi         end of the buffer, saved at 8+8(%rsp)
//   32+8(%rsp)   64-bit factor used to derive each pass's multiplier
//   48+8(%rsp)   start of the buffer being reduced
//   %xmm2/%xmm3  reloaded into %rbp/%rcx at the end of every outer iteration
// Stack slots written: 0+8(%rsp) = %rbp+%r9-64 (end pointer for %rbp),
// 8+8(%rsp) = entry %rdi, 16+8(%rsp) = saved carry mask, 24+8(%rsp) = carry
// from the previous outer iteration, 48+8(%rsp)..104+8(%rsp) = the eight
// multipliers of the current outer iteration.
// On return %rax holds the final carry-out.
3155__bn_sqrx8x_reduction:
3156 xorl %eax,%eax
3157 movq 32+8(%rsp),%rbx
3158 movq 48+8(%rsp),%rdx
3159 leaq -64(%rbp,%r9,1),%rcx
3160
3161 movq %rcx,0+8(%rsp)
3162 movq %rdi,8+8(%rsp)
3163
3164 leaq 48+8(%rsp),%rdi
3165 jmp L$sqrx8x_reduction_loop
3166
3167.p2align 5
// Outer loop: one iteration per 8 words of the buffer. %rdx holds the lowest
// word on entry; it is copied to %r8 and turned into the first multiplier.
3168L$sqrx8x_reduction_loop:
3169 movq 8(%rdi),%r9
3170 movq 16(%rdi),%r10
3171 movq 24(%rdi),%r11
3172 movq 32(%rdi),%r12
3173 movq %rdx,%r8
3174 imulq %rbx,%rdx
3175 movq 40(%rdi),%r13
3176 movq 48(%rdi),%r14
3177 movq 56(%rdi),%r15
3178 movq %rax,24+8(%rsp)
3179
3180 leaq 64(%rdi),%rdi
// xorq also clears CF and OF for the ADCX/ADOX chains.
3181 xorq %rsi,%rsi
3182 movq $-8,%rcx
3183 jmp L$sqrx8x_reduce
3184
3185.p2align 5
// Eight passes (%rcx = -8..-1): add %rdx * (8 words at %rbp) into %r8..%r15.
// The multiplier used is saved at 64+48+8(%rsp,%rcx,8) and the next one is
// computed in-flight as the low half of new %r8 * 32+8(%rsp).
3186L$sqrx8x_reduce:
3187 movq %r8,%rbx
3188 mulxq 0(%rbp),%rax,%r8
3189 adcxq %rbx,%rax
3190 adoxq %r9,%r8
3191
3192 mulxq 8(%rbp),%rbx,%r9
3193 adcxq %rbx,%r8
3194 adoxq %r10,%r9
3195
3196 mulxq 16(%rbp),%rbx,%r10
3197 adcxq %rbx,%r9
3198 adoxq %r11,%r10
3199
3200 mulxq 24(%rbp),%rbx,%r11
3201 adcxq %rbx,%r10
3202 adoxq %r12,%r11
3203
// Encoded "mulxq 32(%rbp),%rbx,%r12".
3204.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
3205 movq %rdx,%rax
3206 movq %r8,%rdx
3207 adcxq %rbx,%r11
3208 adoxq %r13,%r12
3209
// %rbx = low 64 bits of %r8 * 32+8(%rsp); the high half lands in %rdx and is
// overwritten by the next instruction. MULX leaves the flags untouched.
3210 mulxq 32+8(%rsp),%rbx,%rdx
3211 movq %rax,%rdx
3212 movq %rax,64+48+8(%rsp,%rcx,8)
3213
3214 mulxq 40(%rbp),%rax,%r13
3215 adcxq %rax,%r12
3216 adoxq %r14,%r13
3217
3218 mulxq 48(%rbp),%rax,%r14
3219 adcxq %rax,%r13
3220 adoxq %r15,%r14
3221
3222 mulxq 56(%rbp),%rax,%r15
3223 movq %rbx,%rdx
3224 adcxq %rax,%r14
3225 adoxq %rsi,%r15
3226 adcxq %rsi,%r15
3227
3228.byte 0x67,0x67,0x67
3229 incq %rcx
3230 jnz L$sqrx8x_reduce
3231
// %rax = 0 (from %rsi). If %rbp already reached its end pointer, skip the tail.
3232 movq %rsi,%rax
3233 cmpq 0+8(%rsp),%rbp
3234 jae L$sqrx8x_no_tail
3235
// Reload the first saved multiplier and add the next 8 buffer words.
3236 movq 48+8(%rsp),%rdx
3237 addq 0(%rdi),%r8
3238 leaq 64(%rbp),%rbp
3239 movq $-8,%rcx
3240 adcxq 8(%rdi),%r9
3241 adcxq 16(%rdi),%r10
3242 adcq 24(%rdi),%r11
3243 adcq 32(%rdi),%r12
3244 adcq 40(%rdi),%r13
3245 adcq 48(%rdi),%r14
3246 adcq 56(%rdi),%r15
3247 leaq 64(%rdi),%rdi
// Keep the carry-out as a 0/-1 mask at 16+8(%rsp).
3248 sbbq %rax,%rax
3249
3250 xorq %rsi,%rsi
3251 movq %rax,16+8(%rsp)
3252 jmp L$sqrx8x_tail
3253
3254.p2align 5
// Tail: replay the eight saved multipliers against the next 8 words at %rbp;
// each pass stores one finished word at (%rdi,%rcx,8).
3255L$sqrx8x_tail:
3256 movq %r8,%rbx
3257 mulxq 0(%rbp),%rax,%r8
3258 adcxq %rax,%rbx
3259 adoxq %r9,%r8
3260
3261 mulxq 8(%rbp),%rax,%r9
3262 adcxq %rax,%r8
3263 adoxq %r10,%r9
3264
3265 mulxq 16(%rbp),%rax,%r10
3266 adcxq %rax,%r9
3267 adoxq %r11,%r10
3268
3269 mulxq 24(%rbp),%rax,%r11
3270 adcxq %rax,%r10
3271 adoxq %r12,%r11
3272
// Encoded "mulxq 32(%rbp),%rax,%r12".
3273.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3274 adcxq %rax,%r11
3275 adoxq %r13,%r12
3276
3277 mulxq 40(%rbp),%rax,%r13
3278 adcxq %rax,%r12
3279 adoxq %r14,%r13
3280
3281 mulxq 48(%rbp),%rax,%r14
3282 adcxq %rax,%r13
3283 adoxq %r15,%r14
3284
3285 mulxq 56(%rbp),%rax,%r15
// Next saved multiplier.
3286 movq 72+48+8(%rsp,%rcx,8),%rdx
3287 adcxq %rax,%r14
3288 adoxq %rsi,%r15
3289 movq %rbx,(%rdi,%rcx,8)
3290 movq %r8,%rbx
3291 adcxq %rsi,%r15
3292
3293 incq %rcx
3294 jnz L$sqrx8x_tail
3295
3296 cmpq 0+8(%rsp),%rbp
3297 jae L$sqrx8x_tail_done
3298
// %rsi is zero, so 0 - mask sets CF exactly when the saved mask is non-zero.
3299 subq 16+8(%rsp),%rsi
3300 movq 48+8(%rsp),%rdx
3301 leaq 64(%rbp),%rbp
3302 adcq 0(%rdi),%r8
3303 adcq 8(%rdi),%r9
3304 adcq 16(%rdi),%r10
3305 adcq 24(%rdi),%r11
3306 adcq 32(%rdi),%r12
3307 adcq 40(%rdi),%r13
3308 adcq 48(%rdi),%r14
3309 adcq 56(%rdi),%r15
3310 leaq 64(%rdi),%rdi
3311 sbbq %rax,%rax
// %rcx is zero after the loop, so this sets it back to -8.
3312 subq $8,%rcx
3313
3314 xorq %rsi,%rsi
3315 movq %rax,16+8(%rsp)
3316 jmp L$sqrx8x_tail
3317
3318.p2align 5
3319L$sqrx8x_tail_done:
3320 xorq %rax,%rax
// Add the carry saved at 24+8(%rsp) at the top of the outer loop.
3321 addq 24+8(%rsp),%r8
3322 adcq $0,%r9
3323 adcq $0,%r10
3324 adcq $0,%r11
3325 adcq $0,%r12
3326 adcq $0,%r13
3327 adcq $0,%r14
3328 adcq $0,%r15
3329 adcq $0,%rax
3330
3331 subq 16+8(%rsp),%rsi
// When reached by the jae above, CF is clear here (the cmpq did not borrow).
3332L$sqrx8x_no_tail:
3333 adcq 0(%rdi),%r8
// Encoded "movq %xmm3,%rcx" (the mov does not disturb the adc chain).
3334.byte 102,72,15,126,217
3335 adcq 8(%rdi),%r9
3336 movq 56(%rbp),%rsi
// Encoded "movq %xmm2,%rbp": rewind %rbp for the next outer iteration.
3337.byte 102,72,15,126,213
3338 adcq 16(%rdi),%r10
3339 adcq 24(%rdi),%r11
3340 adcq 32(%rdi),%r12
3341 adcq 40(%rdi),%r13
3342 adcq 48(%rdi),%r14
3343 adcq 56(%rdi),%r15
3344 adcq $0,%rax
3345
// Reload the multiplier factor and the lowest word of the next 8-word group.
3346 movq 32+8(%rsp),%rbx
3347 movq 64(%rdi,%rcx,1),%rdx
3348
3349 movq %r8,0(%rdi)
3350 leaq 64(%rdi),%r8
3351 movq %r9,8(%rdi)
3352 movq %r10,16(%rdi)
3353 movq %r11,24(%rdi)
3354 movq %r12,32(%rdi)
3355 movq %r13,40(%rdi)
3356 movq %r14,48(%rdi)
3357 movq %r15,56(%rdi)
3358
// Step %rdi to the next group and loop until the stored words reach the
// buffer end saved at 8+8(%rsp).
3359 leaq 64(%rdi,%rcx,1),%rdi
3360 cmpq 8+8(%rsp),%r8
3361 jb L$sqrx8x_reduction_loop
3362 ret
3363
3364
3365.p2align 5
3366
// __bn_postx4x_internal: masked multi-word add of a complemented number.
// For each 64-bit word i it stores  rdi[i] + (~rbp[i] & mask) + carry  to the
// output, four words per loop iteration.  The lowest word of the rbp operand
// is decremented before being complemented, so the complemented operand is
// its two's-complement negation as long as that lowest word is non-zero.
// With mask = all-ones this subtracts the rbp number from the rdi number;
// with mask = 0 it copies the rdi number unchanged.
//
// Registers read on entry:
//   rbp  - pointer to the words that are complemented (advanced by the loop)
//   rdi  - pointer to the words that are added        (advanced by the loop)
//   rax  - negated below to form the AND mask; presumably 0 or 1 on entry so
//          the mask is 0 or all-ones - TODO confirm against callers
//   rcx  - shifted right by 5 and then counted UP to zero, so it is expected
//          to be a negative multiple of 32 (NOTE(review): looks like minus
//          the length in bytes - confirm)
//   xmm1 - its low 64 bits become the output pointer (rdx) and also rsi
// Registers written: rax, rcx, rdx, rsi, rdi, rbp, r8-r10, r12-r15, flags.
// On return r9 holds the negated entry value of rcx and r10 the entry value.
3367__bn_postx4x_internal:
3368
3369 movq 0(%rbp),%r12
// Keep two copies of the entry count; r9 is negated just before returning.
3370 movq %rcx,%r10
3371 movq %rcx,%r9
// rax = -rax: turns an entry value of 1 into an all-ones mask.
3372 negq %rax
// Arithmetic shift by 5: one loop iteration per 32 units of the entry count.
3373 sarq $3+2,%rcx
3374
// Raw encoding of: movq %xmm1,%rdx
3375.byte 102,72,15,126,202
// Raw encoding of: movq %xmm1,%rsi
3376.byte 102,72,15,126,206
// Lowest word only: ~(w - 1) == -w, which supplies the "+1" of the
// two's-complement negation without a separate carry-in.
3377 decq %r12
3378 movq 8(%rbp),%r13
// r8 is the saved inter-iteration carry (0 or -1); start with no carry.
3379 xorq %r8,%r8
3380 movq 16(%rbp),%r14
3381 movq 24(%rbp),%r15
// First group is already loaded (with the decrement applied); skip the reload.
3382 jmp L$sqrx4x_sub_entry
3383
3384.p2align 4
3385L$sqrx4x_sub:
3386 movq 0(%rbp),%r12
3387 movq 8(%rbp),%r13
3388 movq 16(%rbp),%r14
3389 movq 24(%rbp),%r15
3390L$sqrx4x_sub_entry:
// andnq %rax,%rN,%rN computes rN = ~rN & rax.
3391 andnq %rax,%r12,%r12
3392 leaq 32(%rbp),%rbp
3393 andnq %rax,%r13,%r13
3394 andnq %rax,%r14,%r14
3395 andnq %rax,%r15,%r15
3396
// Reload the saved carry into CF: negq sets CF exactly when r8 was non-zero.
3397 negq %r8
3398 adcq 0(%rdi),%r12
3399 adcq 8(%rdi),%r13
3400 adcq 16(%rdi),%r14
3401 adcq 24(%rdi),%r15
// The movq/leaq instructions below leave CF intact, so the sbbq in their
// midst still captures the carry out of the adcq chain (r8 = 0 or -1).
3402 movq %r12,0(%rdx)
3403 leaq 32(%rdi),%rdi
3404 movq %r13,8(%rdx)
3405 sbbq %r8,%r8
3406 movq %r14,16(%rdx)
3407 movq %r15,24(%rdx)
3408 leaq 32(%rdx),%rdx
3409
// rcx counts up towards zero; the loop ends when it reaches 0.
3410 incq %rcx
3411 jnz L$sqrx4x_sub
3412
// r9 = -(entry rcx).
3413 negq %r9
3414
3415 ret
3416
3417
// _bn_scatter5: copy a run of consecutive 64-bit words into one column of a
// table whose rows are 256 bytes (32 words) apart.
//
// Registers read on entry:
//   rdi - source pointer; words are read from consecutive addresses
//   esi - number of words to copy (only the low 32 bits are examined);
//         zero returns immediately without touching memory
//   rdx - table base address
//   rcx - column index; the first store goes to rdx + rcx*8
// Registers written: rax, rdx, rdi, esi, flags.
3418.globl _bn_scatter5
3419.private_extern _bn_scatter5
3420
3421.p2align 4
3422_bn_scatter5:
3423
// _CET_ENDBR is a macro defined outside this file (presumably the CET
// indirect-branch landing pad - confirm in openssl/asm_base.h).
3424_CET_ENDBR
// Nothing to do for a zero count.
3425 cmpl $0,%esi
3426 jz L$scatter_epilogue
3427
3428
3429
3430
3431
3432
3433
3434
3435
// rdx = address of the selected column in the first row.
3436 leaq (%rdx,%rcx,8),%rdx
3437L$scatter:
3438 movq (%rdi),%rax
3439 leaq 8(%rdi),%rdi
3440 movq %rax,(%rdx)
// Step to the same column in the next row (256-byte row stride).
3441 leaq 256(%rdx),%rdx
3442 subl $1,%esi
3443 jnz L$scatter
3444L$scatter_epilogue:
3445 ret
3446
3447
3448
// _bn_gather5: read one column out of a table with 256-byte rows (32 words
// of 64 bits each), producing one 64-bit word per row.
//
// The column is not addressed directly.  Sixteen 128-bit masks are built on
// the stack, one 64-bit lane per column 0..31, with all-ones only in the lane
// whose number equals the requested index.  For each row all 256 bytes are
// loaded at fixed offsets, ANDed with the masks and ORed together, so the
// addresses accessed do not depend on the index.  An index outside 0..31
// matches no lane and yields zero words.
//
// Registers read on entry:
//   rdi - output pointer; one word is stored per row
//   esi - number of rows to process; the loop body runs before the count is
//         tested and there is no zero check, so the count must be non-zero
//   rdx - table base; accessed with movdqa, so it must be 16-byte aligned
//   ecx - column index to select
// Registers written: rax, rdi, esi, r10, r11, xmm0-xmm5, flags.
// Stack: 264 bytes are reserved and rsp is aligned down to 16; the entry rsp
// is kept in r10 and restored before returning.
3449.globl _bn_gather5
3450.private_extern _bn_gather5
3451
3452.p2align 5
3453_bn_gather5:
3454
// Label marking the start of the function body; not referenced in this block.
3455L$SEH_begin_bn_gather5:
// _CET_ENDBR is a macro defined outside this file (presumably the CET
// indirect-branch landing pad - confirm in openssl/asm_base.h).
3456_CET_ENDBR
3457
// Raw encoding of: leaq (%rsp),%r10   - remember the entry stack pointer.
3458.byte 0x4c,0x8d,0x14,0x24
3459
// Raw encoding of: subq $0x108,%rsp   - reserve 264 bytes.
3460.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00
3461 leaq L$inc(%rip),%rax
// Align the mask buffer for the movdqa stores below.
3462 andq $-16,%rsp
3463
3464 movd %ecx,%xmm5
// xmm0 = {0,0,1,1}: column numbers held by the first mask vector.
3465 movdqa 0(%rax),%xmm0
// xmm1 = {2,2,2,2}: step between consecutive mask vectors.
3466 movdqa 16(%rax),%xmm1
// Both pointers are biased by +128 so that all sixteen 16-byte slots are
// reachable with displacements -128..112.
3467 leaq 128(%rdx),%r11
3468 leaq 128(%rsp),%rax
3469
// Broadcast the index to all four dwords of xmm5.
3470 pshufd $0,%xmm5,%xmm5
// xmm4 keeps the step {2,2,2,2}.  xmm0-xmm3 rotate through three roles:
// "current column numbers", "next column numbers" (paddd) and "mask"
// (pcmpeqd against the index), and each finished mask is stored to the
// stack.  Mask k, at offset -128+16*k from rax, covers columns 2k and 2k+1.
3471 movdqa %xmm1,%xmm4
3472 movdqa %xmm1,%xmm2
3473 paddd %xmm0,%xmm1
3474 pcmpeqd %xmm5,%xmm0
3475 movdqa %xmm4,%xmm3
3476
3477 paddd %xmm1,%xmm2
3478 pcmpeqd %xmm5,%xmm1
3479 movdqa %xmm0,-128(%rax)
3480 movdqa %xmm4,%xmm0
3481
3482 paddd %xmm2,%xmm3
3483 pcmpeqd %xmm5,%xmm2
3484 movdqa %xmm1,-112(%rax)
3485 movdqa %xmm4,%xmm1
3486
3487 paddd %xmm3,%xmm0
3488 pcmpeqd %xmm5,%xmm3
3489 movdqa %xmm2,-96(%rax)
3490 movdqa %xmm4,%xmm2
3491 paddd %xmm0,%xmm1
3492 pcmpeqd %xmm5,%xmm0
3493 movdqa %xmm3,-80(%rax)
3494 movdqa %xmm4,%xmm3
3495
3496 paddd %xmm1,%xmm2
3497 pcmpeqd %xmm5,%xmm1
3498 movdqa %xmm0,-64(%rax)
3499 movdqa %xmm4,%xmm0
3500
3501 paddd %xmm2,%xmm3
3502 pcmpeqd %xmm5,%xmm2
3503 movdqa %xmm1,-48(%rax)
3504 movdqa %xmm4,%xmm1
3505
3506 paddd %xmm3,%xmm0
3507 pcmpeqd %xmm5,%xmm3
3508 movdqa %xmm2,-32(%rax)
3509 movdqa %xmm4,%xmm2
3510 paddd %xmm0,%xmm1
3511 pcmpeqd %xmm5,%xmm0
3512 movdqa %xmm3,-16(%rax)
3513 movdqa %xmm4,%xmm3
3514
3515 paddd %xmm1,%xmm2
3516 pcmpeqd %xmm5,%xmm1
3517 movdqa %xmm0,0(%rax)
3518 movdqa %xmm4,%xmm0
3519
3520 paddd %xmm2,%xmm3
3521 pcmpeqd %xmm5,%xmm2
3522 movdqa %xmm1,16(%rax)
3523 movdqa %xmm4,%xmm1
3524
3525 paddd %xmm3,%xmm0
3526 pcmpeqd %xmm5,%xmm3
3527 movdqa %xmm2,32(%rax)
3528 movdqa %xmm4,%xmm2
3529 paddd %xmm0,%xmm1
3530 pcmpeqd %xmm5,%xmm0
3531 movdqa %xmm3,48(%rax)
3532 movdqa %xmm4,%xmm3
3533
3534 paddd %xmm1,%xmm2
3535 pcmpeqd %xmm5,%xmm1
3536 movdqa %xmm0,64(%rax)
3537 movdqa %xmm4,%xmm0
3538
3539 paddd %xmm2,%xmm3
3540 pcmpeqd %xmm5,%xmm2
3541 movdqa %xmm1,80(%rax)
3542 movdqa %xmm4,%xmm1
3543
3544 paddd %xmm3,%xmm0
3545 pcmpeqd %xmm5,%xmm3
3546 movdqa %xmm2,96(%rax)
3547 movdqa %xmm4,%xmm2
3548 movdqa %xmm3,112(%rax)
// Jump over any padding inserted by the alignment directive below.
3549 jmp L$gather
3550
3551.p2align 5
// One iteration per row: r11 points 128 bytes into the current row and rax
// 128 bytes into the mask buffer.
3552L$gather:
// xmm4 and xmm5 are two independent OR accumulators, merged at the end.
3553 pxor %xmm4,%xmm4
3554 pxor %xmm5,%xmm5
// Columns 0-7.
3555 movdqa -128(%r11),%xmm0
3556 movdqa -112(%r11),%xmm1
3557 movdqa -96(%r11),%xmm2
3558 pand -128(%rax),%xmm0
3559 movdqa -80(%r11),%xmm3
3560 pand -112(%rax),%xmm1
3561 por %xmm0,%xmm4
3562 pand -96(%rax),%xmm2
3563 por %xmm1,%xmm5
3564 pand -80(%rax),%xmm3
3565 por %xmm2,%xmm4
3566 por %xmm3,%xmm5
// Columns 8-15.
3567 movdqa -64(%r11),%xmm0
3568 movdqa -48(%r11),%xmm1
3569 movdqa -32(%r11),%xmm2
3570 pand -64(%rax),%xmm0
3571 movdqa -16(%r11),%xmm3
3572 pand -48(%rax),%xmm1
3573 por %xmm0,%xmm4
3574 pand -32(%rax),%xmm2
3575 por %xmm1,%xmm5
3576 pand -16(%rax),%xmm3
3577 por %xmm2,%xmm4
3578 por %xmm3,%xmm5
// Columns 16-23.
3579 movdqa 0(%r11),%xmm0
3580 movdqa 16(%r11),%xmm1
3581 movdqa 32(%r11),%xmm2
3582 pand 0(%rax),%xmm0
3583 movdqa 48(%r11),%xmm3
3584 pand 16(%rax),%xmm1
3585 por %xmm0,%xmm4
3586 pand 32(%rax),%xmm2
3587 por %xmm1,%xmm5
3588 pand 48(%rax),%xmm3
3589 por %xmm2,%xmm4
3590 por %xmm3,%xmm5
// Columns 24-31.
3591 movdqa 64(%r11),%xmm0
3592 movdqa 80(%r11),%xmm1
3593 movdqa 96(%r11),%xmm2
3594 pand 64(%rax),%xmm0
3595 movdqa 112(%r11),%xmm3
3596 pand 80(%rax),%xmm1
3597 por %xmm0,%xmm4
3598 pand 96(%rax),%xmm2
3599 por %xmm1,%xmm5
3600 pand 112(%rax),%xmm3
3601 por %xmm2,%xmm4
3602 por %xmm3,%xmm5
3603 por %xmm5,%xmm4
// Advance to the next 256-byte row.
3604 leaq 256(%r11),%r11
3605
// Swap the two 64-bit halves and OR them, so the selected word ends up in the
// low half whichever lane it came from.
3606 pshufd $0x4e,%xmm4,%xmm0
3607 por %xmm4,%xmm0
3608 movq %xmm0,(%rdi)
3609 leaq 8(%rdi),%rdi
3610 subl $1,%esi
3611 jnz L$gather
3612
// Restore the stack pointer saved in r10 at entry.
3613 leaq (%r10),%rsp
3614
3615 ret
// Label marking the end of the function; not referenced in this block.
3616L$SEH_end_bn_gather5:
3617
3618
// Read-only constants, 64-byte aligned.
// L$inc holds two 16-byte vectors that the gather code loads with movdqa
// when it builds its column-select masks:
//   offset  0: {0,0,1,1} - column numbers covered by the first mask vector
//   offset 16: {2,2,2,2} - step added to obtain each following vector
3619.section __DATA,__const
3620.p2align 6
3621L$inc:
3622.long 0,0, 1,1
3623.long 2,2, 2,2
// NUL-terminated ASCII string: "Montgomery Multiplication with
// scatter/gather for x86_64, CRYPTOGAMS by <appro@openssl.org>".
3624.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
3625.text
3626#endif