; blob: 3b12405d0de888fef2ea0eda32985d4268e8917e
; This file is generated from a similarly-named Perl script in the BoringSSL
; source tree. Do not edit by hand.

; Assembler preamble. Everything that follows is assembled only when NASM's
; output format is win64 (the matching %endif is not visible in this chunk).
%ifidn __OUTPUT_FORMAT__, win64
default rel			; bare [symbol] operands become RIP-relative
; The generator emits MASM-style size keywords in front of memory operands;
; defining them as empty makes them vanish for NASM.
%define XMMWORD
%define YMMWORD
%define ZMMWORD
; _CET_ENDBR is defined empty here, so the markers at function entry expand
; to nothing in this build.
%define _CET_ENDBR

; Optional symbol prefixing, enabled by defining BORINGSSL_PREFIX.
%ifdef BORINGSSL_PREFIX
%include "boringssl_prefix_symbols_nasm.inc"
%endif
section .text code align=64


;-----------------------------------------------------------------------
; bn_mul_mont_gather5_nohw
;
; ABI:  Microsoft x64 on entry. The prologue re-homes the arguments into
;       the registers the body uses:
;         rcx -> rdi, rdx -> rsi, r8 -> rdx, r9 -> rcx,
;         [40+rsp] -> r8, [48+rsp] -> r9 (only the low 32 bits are kept),
;         [56+rsp] -> xmm5 (32-bit selector used by the gather).
;       NOTE(review): presumably these are (rp, ap, table, np, n0, num,
;       power) of the C prototype -- confirm against the caller.
; Out:  rax = 1
; Preserved: rbx, rbp, r12-r15 (pushed) and rdi, rsi (saved in the two
;       slots at [8+rsp] / [16+rsp] relative to the entry rsp).
; Stack: carves a frame of at least 280 + 8*r9 bytes below the pushes,
;       aligned down to 1024, touching every 4096-byte page on the way.
;       The entry rsp is kept at [8+r9*8+rsp] for the epilogue.
; Uses: $L$inc (two 16-byte constants defined outside this view).
;-----------------------------------------------------------------------
global bn_mul_mont_gather5_nohw

ALIGN 64
bn_mul_mont_gather5_nohw:
	mov QWORD[8+rsp],rdi ;WIN64 prologue
	mov QWORD[16+rsp],rsi
	mov rax,rsp
$L$SEH_begin_bn_mul_mont_gather5_nohw:
	mov rdi,rcx
	mov rsi,rdx
	mov rdx,r8
	mov rcx,r9
	mov r8,QWORD[40+rsp]
	mov r9,QWORD[48+rsp]

_CET_ENDBR

	mov r9d,r9d			; zero-extend the 32-bit count
	mov rax,rsp			; rax = entry rsp, restored in the epilogue

	movd xmm5,DWORD[56+rsp]		; selector, read before rsp moves
	push rbx
	push rbp
	push r12
	push r13
	push r14
	push r15

; r10 = target frame base = (rsp - 280 - 8*r9) rounded down to 1024.
	neg r9
	mov r11,rsp
	lea r10,[((-280))+r9*8+rsp]
	neg r9
	and r10,-1024

; Move rsp down to r10 one page at a time, reading a word from each page
; so that no 4096-byte page between the old and new rsp is skipped.
	sub r11,r10
	and r11,-4096
	lea rsp,[r11*1+r10]
	mov r11,QWORD[rsp]
	cmp rsp,r10
	ja NEAR $L$mul_page_walk
	jmp NEAR $L$mul_page_walk_done

$L$mul_page_walk:
	lea rsp,[((-4096))+rsp]
	mov r11,QWORD[rsp]
	cmp rsp,r10
	ja NEAR $L$mul_page_walk
$L$mul_page_walk_done:

	lea r10,[$L$inc]
	mov QWORD[8+r9*8+rsp],rax	; stash entry rsp above the scratch words

$L$mul_body:

; Build 16 compare masks. xmm0 starts as the constant at $L$inc and is
; stepped by the constant at $L$inc+16 (kept in xmm4); each counter vector
; is compared for equality against the broadcast selector in xmm5. The
; masks are stored at [112+r10] .. [352+r10] (r10 is 16-byte aligned).
	lea r12,[128+rdx]		; r12 = table + 128, so rows sit at -128..+112
	movdqa xmm0,XMMWORD[r10]
	movdqa xmm1,XMMWORD[16+r10]
	lea r10,[((24-112))+r9*8+rsp]
	and r10,-16

	pshufd xmm5,xmm5,0		; broadcast selector to all four lanes
	movdqa xmm4,xmm1
	movdqa xmm2,xmm1
	paddd xmm1,xmm0
	pcmpeqd xmm0,xmm5
	DB 0x67				; raw 0x67 prefix byte, kept exactly as generated
	movdqa xmm3,xmm4
	paddd xmm2,xmm1
	pcmpeqd xmm1,xmm5
	movdqa XMMWORD[112+r10],xmm0
	movdqa xmm0,xmm4

	paddd xmm3,xmm2
	pcmpeqd xmm2,xmm5
	movdqa XMMWORD[128+r10],xmm1
	movdqa xmm1,xmm4

	paddd xmm0,xmm3
	pcmpeqd xmm3,xmm5
	movdqa XMMWORD[144+r10],xmm2
	movdqa xmm2,xmm4

	paddd xmm1,xmm0
	pcmpeqd xmm0,xmm5
	movdqa XMMWORD[160+r10],xmm3
	movdqa xmm3,xmm4
	paddd xmm2,xmm1
	pcmpeqd xmm1,xmm5
	movdqa XMMWORD[176+r10],xmm0
	movdqa xmm0,xmm4

	paddd xmm3,xmm2
	pcmpeqd xmm2,xmm5
	movdqa XMMWORD[192+r10],xmm1
	movdqa xmm1,xmm4

	paddd xmm0,xmm3
	pcmpeqd xmm3,xmm5
	movdqa XMMWORD[208+r10],xmm2
	movdqa xmm2,xmm4

	paddd xmm1,xmm0
	pcmpeqd xmm0,xmm5
	movdqa XMMWORD[224+r10],xmm3
	movdqa xmm3,xmm4
	paddd xmm2,xmm1
	pcmpeqd xmm1,xmm5
	movdqa XMMWORD[240+r10],xmm0
	movdqa xmm0,xmm4

	paddd xmm3,xmm2
	pcmpeqd xmm2,xmm5
	movdqa XMMWORD[256+r10],xmm1
	movdqa xmm1,xmm4

	paddd xmm0,xmm3
	pcmpeqd xmm3,xmm5
	movdqa XMMWORD[272+r10],xmm2
	movdqa xmm2,xmm4

	paddd xmm1,xmm0
	pcmpeqd xmm0,xmm5
	movdqa XMMWORD[288+r10],xmm3
	movdqa xmm3,xmm4
	paddd xmm2,xmm1
	pcmpeqd xmm1,xmm5
	movdqa XMMWORD[304+r10],xmm0

	paddd xmm3,xmm2
	DB 0x67				; raw 0x67 prefix byte, kept exactly as generated
	pcmpeqd xmm2,xmm5
	movdqa XMMWORD[320+r10],xmm1

	pcmpeqd xmm3,xmm5
	movdqa XMMWORD[336+r10],xmm2

; First gather: AND all 16 vectors of the current 256-byte table group with
; their masks and OR the results together. Every vector is loaded no matter
; what the selector is. The last four masks are still live in xmm0-xmm3.
	pand xmm0,XMMWORD[64+r12]

	pand xmm1,XMMWORD[80+r12]
	pand xmm2,XMMWORD[96+r12]
	movdqa XMMWORD[352+r10],xmm3
	pand xmm3,XMMWORD[112+r12]
	por xmm0,xmm2
	por xmm1,xmm3
	movdqa xmm4,XMMWORD[((-128))+r12]
	movdqa xmm5,XMMWORD[((-112))+r12]
	movdqa xmm2,XMMWORD[((-96))+r12]
	pand xmm4,XMMWORD[112+r10]
	movdqa xmm3,XMMWORD[((-80))+r12]
	pand xmm5,XMMWORD[128+r10]
	por xmm0,xmm4
	pand xmm2,XMMWORD[144+r10]
	por xmm1,xmm5
	pand xmm3,XMMWORD[160+r10]
	por xmm0,xmm2
	por xmm1,xmm3
	movdqa xmm4,XMMWORD[((-64))+r12]
	movdqa xmm5,XMMWORD[((-48))+r12]
	movdqa xmm2,XMMWORD[((-32))+r12]
	pand xmm4,XMMWORD[176+r10]
	movdqa xmm3,XMMWORD[((-16))+r12]
	pand xmm5,XMMWORD[192+r10]
	por xmm0,xmm4
	pand xmm2,XMMWORD[208+r10]
	por xmm1,xmm5
	pand xmm3,XMMWORD[224+r10]
	por xmm0,xmm2
	por xmm1,xmm3
	movdqa xmm4,XMMWORD[r12]
	movdqa xmm5,XMMWORD[16+r12]
	movdqa xmm2,XMMWORD[32+r12]
	pand xmm4,XMMWORD[240+r10]
	movdqa xmm3,XMMWORD[48+r12]
	pand xmm5,XMMWORD[256+r10]
	por xmm0,xmm4
	pand xmm2,XMMWORD[272+r10]
	por xmm1,xmm5
	pand xmm3,XMMWORD[288+r10]
	por xmm0,xmm2
	por xmm1,xmm3
	por xmm0,xmm1

	pshufd xmm1,xmm0,0x4e		; swap 64-bit halves ...
	por xmm0,xmm1			; ... and fold them into the low qword
	lea r12,[256+r12]		; advance to the next 256-byte table group
DB 102,72,15,126,195			; movq rbx,xmm0 (hand-encoded)

; First pass: rbx = gathered word, rbp = r8 * low(a[0]*rbx) where r8 is the
; word loaded through the fifth argument. Results go to the scratch words
; at [rsp]; r15 is the word index.
	mov r8,QWORD[r8]
	mov rax,QWORD[rsi]

	xor r14,r14			; r14 = outer-loop counter
	xor r15,r15			; r15 = inner word index

	mov rbp,r8
	mul rbx
	mov r10,rax
	mov rax,QWORD[rcx]

	imul rbp,r10
	mov r11,rdx

	mul rbp
	add r10,rax
	mov rax,QWORD[8+rsi]
	adc rdx,0
	mov r13,rdx

	lea r15,[1+r15]
	jmp NEAR $L$1st_enter

ALIGN 16
$L$1st:
	add r13,rax
	mov rax,QWORD[r15*8+rsi]
	adc rdx,0
	add r13,r11
	mov r11,r10
	adc rdx,0
	mov QWORD[((-16))+r15*8+rsp],r13
	mov r13,rdx

$L$1st_enter:
	mul rbx
	add r11,rax
	mov rax,QWORD[r15*8+rcx]
	adc rdx,0
	lea r15,[1+r15]
	mov r10,rdx

	mul rbp
	cmp r15,r9
	jne NEAR $L$1st

; Tail of the first pass: fold the last partial products and store the two
; top words plus the carry word at [r9*8+rsp].
	add r13,rax
	adc rdx,0
	add r13,r11
	adc rdx,0
	mov QWORD[((-16))+r9*8+rsp],r13
	mov r13,rdx
	mov r11,r10

	xor rdx,rdx
	add r13,r11
	adc rdx,0
	mov QWORD[((-8))+r9*8+rsp],r13
	mov QWORD[r9*8+rsp],rdx

	lea r14,[1+r14]
	jmp NEAR $L$outer
ALIGN 16
$L$outer:
; Gather the next word: same mask-and-OR over the next table group, this
; time reading the 16 masks back through rdx (rdx-128 equals the mask base
; r10+112 computed above).
	lea rdx,[((24+128))+r9*8+rsp]
	and rdx,-16
	pxor xmm4,xmm4
	pxor xmm5,xmm5
	movdqa xmm0,XMMWORD[((-128))+r12]
	movdqa xmm1,XMMWORD[((-112))+r12]
	movdqa xmm2,XMMWORD[((-96))+r12]
	movdqa xmm3,XMMWORD[((-80))+r12]
	pand xmm0,XMMWORD[((-128))+rdx]
	pand xmm1,XMMWORD[((-112))+rdx]
	por xmm4,xmm0
	pand xmm2,XMMWORD[((-96))+rdx]
	por xmm5,xmm1
	pand xmm3,XMMWORD[((-80))+rdx]
	por xmm4,xmm2
	por xmm5,xmm3
	movdqa xmm0,XMMWORD[((-64))+r12]
	movdqa xmm1,XMMWORD[((-48))+r12]
	movdqa xmm2,XMMWORD[((-32))+r12]
	movdqa xmm3,XMMWORD[((-16))+r12]
	pand xmm0,XMMWORD[((-64))+rdx]
	pand xmm1,XMMWORD[((-48))+rdx]
	por xmm4,xmm0
	pand xmm2,XMMWORD[((-32))+rdx]
	por xmm5,xmm1
	pand xmm3,XMMWORD[((-16))+rdx]
	por xmm4,xmm2
	por xmm5,xmm3
	movdqa xmm0,XMMWORD[r12]
	movdqa xmm1,XMMWORD[16+r12]
	movdqa xmm2,XMMWORD[32+r12]
	movdqa xmm3,XMMWORD[48+r12]
	pand xmm0,XMMWORD[rdx]
	pand xmm1,XMMWORD[16+rdx]
	por xmm4,xmm0
	pand xmm2,XMMWORD[32+rdx]
	por xmm5,xmm1
	pand xmm3,XMMWORD[48+rdx]
	por xmm4,xmm2
	por xmm5,xmm3
	movdqa xmm0,XMMWORD[64+r12]
	movdqa xmm1,XMMWORD[80+r12]
	movdqa xmm2,XMMWORD[96+r12]
	movdqa xmm3,XMMWORD[112+r12]
	pand xmm0,XMMWORD[64+rdx]
	pand xmm1,XMMWORD[80+rdx]
	por xmm4,xmm0
	pand xmm2,XMMWORD[96+rdx]
	por xmm5,xmm1
	pand xmm3,XMMWORD[112+rdx]
	por xmm4,xmm2
	por xmm5,xmm3
	por xmm4,xmm5

	pshufd xmm0,xmm4,0x4e
	por xmm0,xmm4
	lea r12,[256+r12]

	mov rax,QWORD[rsi]
DB 102,72,15,126,195			; movq rbx,xmm0 (hand-encoded)

; Multiply-accumulate pass: like the first pass, but each product is added
; to the scratch word already stored at [rsp].
	xor r15,r15
	mov rbp,r8
	mov r10,QWORD[rsp]

	mul rbx
	add r10,rax
	mov rax,QWORD[rcx]
	adc rdx,0

	imul rbp,r10
	mov r11,rdx

	mul rbp
	add r10,rax
	mov rax,QWORD[8+rsi]
	adc rdx,0
	mov r10,QWORD[8+rsp]
	mov r13,rdx

	lea r15,[1+r15]
	jmp NEAR $L$inner_enter

ALIGN 16
$L$inner:
	add r13,rax
	mov rax,QWORD[r15*8+rsi]
	adc rdx,0
	add r13,r10
	mov r10,QWORD[r15*8+rsp]
	adc rdx,0
	mov QWORD[((-16))+r15*8+rsp],r13
	mov r13,rdx

$L$inner_enter:
	mul rbx
	add r11,rax
	mov rax,QWORD[r15*8+rcx]
	adc rdx,0
	add r10,r11
	mov r11,rdx
	adc r11,0
	lea r15,[1+r15]

	mul rbp
	cmp r15,r9
	jne NEAR $L$inner

	add r13,rax
	adc rdx,0
	add r13,r10
	mov r10,QWORD[r9*8+rsp]		; carry word left by the previous pass
	adc rdx,0
	mov QWORD[((-16))+r9*8+rsp],r13
	mov r13,rdx

	xor rdx,rdx
	add r13,r11
	adc rdx,0
	add r13,r10
	adc rdx,0
	mov QWORD[((-8))+r9*8+rsp],r13
	mov QWORD[r9*8+rsp],rdx

	lea r14,[1+r14]
	cmp r14,r9
	jb NEAR $L$outer

; Final step: write (scratch - [rcx]) to the output word by word. The xor
; clears CF before the first sbb; lea and dec leave CF untouched, so the
; borrow chains across iterations.
	xor r14,r14
	mov rax,QWORD[rsp]
	lea rsi,[rsp]
	mov r15,r9
	jmp NEAR $L$sub
ALIGN 16
$L$sub:	sbb rax,QWORD[r14*8+rcx]
	mov QWORD[r14*8+rdi],rax
	mov rax,QWORD[8+r14*8+rsi]
	lea r14,[1+r14]
	dec r15
	jnz NEAR $L$sub

; rax = top scratch word minus the final borrow; rbx = its complement.
; Each output word becomes (difference & rbx) | (scratch & rax), so the
; choice between the two candidates is made with masks, not a branch.
	sbb rax,0
	mov rbx,-1
	xor rbx,rax
	xor r14,r14
	mov r15,r9

$L$copy:
	mov rcx,QWORD[r14*8+rdi]
	mov rdx,QWORD[r14*8+rsp]
	and rcx,rbx
	and rdx,rax
	mov QWORD[r14*8+rsp],r14	; overwrite the scratch word with the loop index
	or rdx,rcx
	mov QWORD[r14*8+rdi],rdx
	lea r14,[1+r14]
	sub r15,1
	jnz NEAR $L$copy

; Epilogue: reload the entry rsp, restore the pushed registers from just
; below it, and return 1.
	mov rsi,QWORD[8+r9*8+rsp]

	mov rax,1

	mov r15,QWORD[((-48))+rsi]
	mov r14,QWORD[((-40))+rsi]
	mov r13,QWORD[((-32))+rsi]
	mov r12,QWORD[((-24))+rsi]
	mov rbp,QWORD[((-16))+rsi]
	mov rbx,QWORD[((-8))+rsi]
	lea rsp,[rsi]

$L$mul_epilogue:
	mov rdi,QWORD[8+rsp] ;WIN64 epilogue
	mov rsi,QWORD[16+rsp]
	ret

$L$SEH_end_bn_mul_mont_gather5_nohw:
;-----------------------------------------------------------------------
; bn_mul4x_mont_gather5
;
; Entry wrapper around mul4x_internal: converts the Win64 argument
; registers, builds the stack frame, calls the shared body and restores
; the callee-saved registers.
;
; ABI:  Microsoft x64 on entry; arguments are re-homed as
;         rcx -> rdi, rdx -> rsi, r8 -> rdx, r9 -> rcx,
;         [40+rsp] -> r8, [48+rsp] -> r9.
;       The seventh argument at [56+entry rsp] is read later by
;       mul4x_internal through rax.
; Out:  rax = 1
; Preserved: rbx, rbp, r12-r15 (pushed) and rdi, rsi (slots at [8+rsp] /
;       [16+rsp] relative to the entry rsp).
; Stack: frame of roughly 320 + 2*(8*r9d) bytes, 64-byte aligned, reached
;       by touching every 4096-byte page; entry rsp is kept at [40+rsp].
;-----------------------------------------------------------------------
global bn_mul4x_mont_gather5

ALIGN 32
bn_mul4x_mont_gather5:
	mov QWORD[8+rsp],rdi ;WIN64 prologue
	mov QWORD[16+rsp],rsi
	mov rax,rsp
$L$SEH_begin_bn_mul4x_mont_gather5:
	mov rdi,rcx
	mov rsi,rdx
	mov rdx,r8
	mov rcx,r9
	mov r8,QWORD[40+rsp]
	mov r9,QWORD[48+rsp]

_CET_ENDBR
	DB 0x67				; raw 0x67 prefix byte, kept exactly as generated
	mov rax,rsp			; rax = entry rsp, restored in the epilogue

	push rbx
	push rbp
	push r12
	push r13
	push r14
	push r15

$L$mul4x_prologue:

	DB 0x67				; raw 0x67 prefix byte, kept exactly as generated

	shl r9d,3			; r9 = count * 8 (bytes); upper half zeroed
	lea r10,[r9*2+r9]		; r10 = 3 * byte length
	neg r9

; Pick the frame base in rbp. r11 is the distance, modulo 4096, between the
; tentative frame (rsp - 320 - 2*len) and the output pointer rdi; the two
; paths below place the frame differently depending on how r11 compares
; with 3*len.
; NOTE(review): presumably this keeps the scratch area from aliasing the
; output buffer modulo 4096 -- confirm against the generating Perl script.
	lea r11,[((-320))+r9*2+rsp]
	mov rbp,rsp
	sub r11,rdi
	and r11,4095
	cmp r10,r11
	jb NEAR $L$mul4xsp_alt
	sub rbp,r11
	lea rbp,[((-320))+r9*2+rbp]
	jmp NEAR $L$mul4xsp_done

ALIGN 32
$L$mul4xsp_alt:
	lea r10,[((4096-320))+r9*2]
	lea rbp,[((-320))+r9*2+rbp]
	sub r11,r10
	mov r10,0			; must be mov: xor would destroy CF needed by cmovc
	cmovc r11,r10			; clamp r11 to 0 if the subtraction borrowed
	sub rbp,r11
$L$mul4xsp_done:
	and rbp,-64
; Move rsp down to rbp one page at a time, reading a word from each page.
	mov r11,rsp
	sub r11,rbp
	and r11,-4096
	lea rsp,[rbp*1+r11]
	mov r10,QWORD[rsp]
	cmp rsp,rbp
	ja NEAR $L$mul4x_page_walk
	jmp NEAR $L$mul4x_page_walk_done

$L$mul4x_page_walk:
	lea rsp,[((-4096))+rsp]
	mov r10,QWORD[rsp]
	cmp rsp,rbp
	ja NEAR $L$mul4x_page_walk
$L$mul4x_page_walk_done:

	neg r9				; back to the positive byte length

	mov QWORD[40+rsp],rax		; save entry rsp for the epilogue

$L$mul4x_body:

	call mul4x_internal

; Epilogue: reload the entry rsp, restore the pushed registers from just
; below it, and return 1.
	mov rsi,QWORD[40+rsp]

	mov rax,1

	mov r15,QWORD[((-48))+rsi]
	mov r14,QWORD[((-40))+rsi]
	mov r13,QWORD[((-32))+rsi]
	mov r12,QWORD[((-24))+rsi]
	mov rbp,QWORD[((-16))+rsi]
	mov rbx,QWORD[((-8))+rsi]
	lea rsp,[rsi]

$L$mul4x_epilogue:
	mov rdi,QWORD[8+rsp] ;WIN64 epilogue
	mov rsi,QWORD[16+rsp]
	ret

$L$SEH_end_bn_mul4x_mont_gather5:


;-----------------------------------------------------------------------
; mul4x_internal -- shared multiply body, unrolled four words per loop
; iteration. File-local (no `global`); reached via `call` from
; bn_mul4x_mont_gather5 and bn_power5_nohw, which is why frame slots are
; written as ((N+8))+rsp: the extra 8 skips the return address.
;
; In:   rax = caller's entry rsp (the 32-bit selector is read from [56+rax])
;       rdx = table base
;       rsi, rcx = the two word arrays walked in lock-step
;             (NOTE(review): presumably the multiplicand and the modulus)
;       r8  = pointer to a word that is loaded and used as the per-pass
;             multiplier seed
;       r9  = length in bytes (positive)
;       rdi = output pointer, parked at [((56+8))+rsp]
; Frame slots written here: [((16+8))+rsp] = end-of-table sentinel,
;       [((56+8))+rsp] = rdi, scratch words from [((64+8))+rsp] upward,
;       16 mask vectors starting at [112+r10].
; Exit: does not `ret` itself; it sets up registers and jumps to
;       $L$sqr4x_sub_entry (defined outside this view).
; Uses: $L$inc (two 16-byte constants defined outside this view).
;-----------------------------------------------------------------------
ALIGN 32
mul4x_internal:

	shl r9,5
	movd xmm5,DWORD[56+rax]		; selector for the gather
	lea rax,[$L$inc]
	lea r13,[128+r9*1+rdx]		; r13 = table + 128 + 32*len: loop sentinel for r12
	shr r9,5
	movdqa xmm0,XMMWORD[rax]
	movdqa xmm1,XMMWORD[16+rax]
	lea r10,[((88-112))+r9*1+rsp]
	lea r12,[128+rdx]		; r12 = table + 128, rows sit at -128..+112

; Build 16 compare masks: counter vectors start at the $L$inc constant and
; step by the constant at $L$inc+16 (kept in xmm4); each is compared for
; equality with the broadcast selector and stored at [112+r10]..[352+r10].
	pshufd xmm5,xmm5,0
	movdqa xmm4,xmm1
	DB 0x67,0x67			; raw prefix bytes, kept exactly as generated
	movdqa xmm2,xmm1
	paddd xmm1,xmm0
	pcmpeqd xmm0,xmm5
	DB 0x67				; raw prefix byte, kept exactly as generated
	movdqa xmm3,xmm4
	paddd xmm2,xmm1
	pcmpeqd xmm1,xmm5
	movdqa XMMWORD[112+r10],xmm0
	movdqa xmm0,xmm4

	paddd xmm3,xmm2
	pcmpeqd xmm2,xmm5
	movdqa XMMWORD[128+r10],xmm1
	movdqa xmm1,xmm4

	paddd xmm0,xmm3
	pcmpeqd xmm3,xmm5
	movdqa XMMWORD[144+r10],xmm2
	movdqa xmm2,xmm4

	paddd xmm1,xmm0
	pcmpeqd xmm0,xmm5
	movdqa XMMWORD[160+r10],xmm3
	movdqa xmm3,xmm4
	paddd xmm2,xmm1
	pcmpeqd xmm1,xmm5
	movdqa XMMWORD[176+r10],xmm0
	movdqa xmm0,xmm4

	paddd xmm3,xmm2
	pcmpeqd xmm2,xmm5
	movdqa XMMWORD[192+r10],xmm1
	movdqa xmm1,xmm4

	paddd xmm0,xmm3
	pcmpeqd xmm3,xmm5
	movdqa XMMWORD[208+r10],xmm2
	movdqa xmm2,xmm4

	paddd xmm1,xmm0
	pcmpeqd xmm0,xmm5
	movdqa XMMWORD[224+r10],xmm3
	movdqa xmm3,xmm4
	paddd xmm2,xmm1
	pcmpeqd xmm1,xmm5
	movdqa XMMWORD[240+r10],xmm0
	movdqa xmm0,xmm4

	paddd xmm3,xmm2
	pcmpeqd xmm2,xmm5
	movdqa XMMWORD[256+r10],xmm1
	movdqa xmm1,xmm4

	paddd xmm0,xmm3
	pcmpeqd xmm3,xmm5
	movdqa XMMWORD[272+r10],xmm2
	movdqa xmm2,xmm4

	paddd xmm1,xmm0
	pcmpeqd xmm0,xmm5
	movdqa XMMWORD[288+r10],xmm3
	movdqa xmm3,xmm4
	paddd xmm2,xmm1
	pcmpeqd xmm1,xmm5
	movdqa XMMWORD[304+r10],xmm0

	paddd xmm3,xmm2
	DB 0x67				; raw prefix byte, kept exactly as generated
	pcmpeqd xmm2,xmm5
	movdqa XMMWORD[320+r10],xmm1

	pcmpeqd xmm3,xmm5
	movdqa XMMWORD[336+r10],xmm2

; First gather: AND all 16 vectors of the current 256-byte table group with
; their masks and OR them together; every vector is loaded regardless of
; the selector.
	pand xmm0,XMMWORD[64+r12]

	pand xmm1,XMMWORD[80+r12]
	pand xmm2,XMMWORD[96+r12]
	movdqa XMMWORD[352+r10],xmm3
	pand xmm3,XMMWORD[112+r12]
	por xmm0,xmm2
	por xmm1,xmm3
	movdqa xmm4,XMMWORD[((-128))+r12]
	movdqa xmm5,XMMWORD[((-112))+r12]
	movdqa xmm2,XMMWORD[((-96))+r12]
	pand xmm4,XMMWORD[112+r10]
	movdqa xmm3,XMMWORD[((-80))+r12]
	pand xmm5,XMMWORD[128+r10]
	por xmm0,xmm4
	pand xmm2,XMMWORD[144+r10]
	por xmm1,xmm5
	pand xmm3,XMMWORD[160+r10]
	por xmm0,xmm2
	por xmm1,xmm3
	movdqa xmm4,XMMWORD[((-64))+r12]
	movdqa xmm5,XMMWORD[((-48))+r12]
	movdqa xmm2,XMMWORD[((-32))+r12]
	pand xmm4,XMMWORD[176+r10]
	movdqa xmm3,XMMWORD[((-16))+r12]
	pand xmm5,XMMWORD[192+r10]
	por xmm0,xmm4
	pand xmm2,XMMWORD[208+r10]
	por xmm1,xmm5
	pand xmm3,XMMWORD[224+r10]
	por xmm0,xmm2
	por xmm1,xmm3
	movdqa xmm4,XMMWORD[r12]
	movdqa xmm5,XMMWORD[16+r12]
	movdqa xmm2,XMMWORD[32+r12]
	pand xmm4,XMMWORD[240+r10]
	movdqa xmm3,XMMWORD[48+r12]
	pand xmm5,XMMWORD[256+r10]
	por xmm0,xmm4
	pand xmm2,XMMWORD[272+r10]
	por xmm1,xmm5
	pand xmm3,XMMWORD[288+r10]
	por xmm0,xmm2
	por xmm1,xmm3
	por xmm0,xmm1

	pshufd xmm1,xmm0,0x4e		; swap 64-bit halves ...
	por xmm0,xmm1			; ... and fold them into the low qword
	lea r12,[256+r12]		; advance to the next 256-byte table group
DB 102,72,15,126,195			; movq rbx,xmm0 (hand-encoded)

	mov QWORD[((16+8))+rsp],r13	; park the table sentinel
	mov QWORD[((56+8))+rsp],rdi	; park the output pointer; rdi is reused below

; First pass. rsi is advanced to the end of its array and r9 negated, so
; words are addressed with negative offsets counting up towards zero.
	mov r8,QWORD[r8]
	mov rax,QWORD[rsi]
	lea rsi,[r9*1+rsi]
	neg r9

	mov rbp,r8
	mul rbx
	mov r10,rax
	mov rax,QWORD[rcx]

	imul rbp,r10			; rbp = r8 * low product word
	lea r14,[((64+8))+rsp]		; r14 = scratch write cursor
	mov r11,rdx

	mul rbp
	add r10,rax
	mov rax,QWORD[8+r9*1+rsi]
	adc rdx,0
	mov rdi,rdx

	mul rbx
	add r11,rax
	mov rax,QWORD[8+rcx]
	adc rdx,0
	mov r10,rdx

	mul rbp
	add rdi,rax
	mov rax,QWORD[16+r9*1+rsi]
	adc rdx,0
	add rdi,r11
	lea r15,[32+r9]			; r15 = negative byte offset, stepped by 32
	lea rcx,[32+rcx]
	adc rdx,0
	mov QWORD[r14],rdi
	mov r13,rdx
	jmp NEAR $L$1st4x

ALIGN 32
$L$1st4x:
	mul rbx
	add r10,rax
	mov rax,QWORD[((-16))+rcx]
	lea r14,[32+r14]
	adc rdx,0
	mov r11,rdx

	mul rbp
	add r13,rax
	mov rax,QWORD[((-8))+r15*1+rsi]
	adc rdx,0
	add r13,r10
	adc rdx,0
	mov QWORD[((-24))+r14],r13
	mov rdi,rdx

	mul rbx
	add r11,rax
	mov rax,QWORD[((-8))+rcx]
	adc rdx,0
	mov r10,rdx

	mul rbp
	add rdi,rax
	mov rax,QWORD[r15*1+rsi]
	adc rdx,0
	add rdi,r11
	adc rdx,0
	mov QWORD[((-16))+r14],rdi
	mov r13,rdx

	mul rbx
	add r10,rax
	mov rax,QWORD[rcx]
	adc rdx,0
	mov r11,rdx

	mul rbp
	add r13,rax
	mov rax,QWORD[8+r15*1+rsi]
	adc rdx,0
	add r13,r10
	adc rdx,0
	mov QWORD[((-8))+r14],r13
	mov rdi,rdx

	mul rbx
	add r11,rax
	mov rax,QWORD[8+rcx]
	adc rdx,0
	mov r10,rdx

	mul rbp
	add rdi,rax
	mov rax,QWORD[16+r15*1+rsi]
	adc rdx,0
	add rdi,r11
	lea rcx,[32+rcx]
	adc rdx,0
	mov QWORD[r14],rdi
	mov r13,rdx

	add r15,32
	jnz NEAR $L$1st4x

; Tail of the first pass (last two words), then rewind rcx by the length.
	mul rbx
	add r10,rax
	mov rax,QWORD[((-16))+rcx]
	lea r14,[32+r14]
	adc rdx,0
	mov r11,rdx

	mul rbp
	add r13,rax
	mov rax,QWORD[((-8))+rsi]
	adc rdx,0
	add r13,r10
	adc rdx,0
	mov QWORD[((-24))+r14],r13
	mov rdi,rdx

	mul rbx
	add r11,rax
	mov rax,QWORD[((-8))+rcx]
	adc rdx,0
	mov r10,rdx

	mul rbp
	add rdi,rax
	mov rax,QWORD[r9*1+rsi]		; reload the first word for the next pass
	adc rdx,0
	add rdi,r11
	adc rdx,0
	mov QWORD[((-16))+r14],rdi
	mov r13,rdx

	lea rcx,[r9*1+rcx]

	xor rdi,rdi
	add r13,r10
	adc rdi,0			; rdi = carry out of the top word
	mov QWORD[((-8))+r14],r13

	jmp NEAR $L$outer4x

ALIGN 32
$L$outer4x:
; Gather the next word through the stored masks (rdx-128 equals the mask
; base r10+112 computed at entry).
	lea rdx,[((16+128))+r14]
	pxor xmm4,xmm4
	pxor xmm5,xmm5
	movdqa xmm0,XMMWORD[((-128))+r12]
	movdqa xmm1,XMMWORD[((-112))+r12]
	movdqa xmm2,XMMWORD[((-96))+r12]
	movdqa xmm3,XMMWORD[((-80))+r12]
	pand xmm0,XMMWORD[((-128))+rdx]
	pand xmm1,XMMWORD[((-112))+rdx]
	por xmm4,xmm0
	pand xmm2,XMMWORD[((-96))+rdx]
	por xmm5,xmm1
	pand xmm3,XMMWORD[((-80))+rdx]
	por xmm4,xmm2
	por xmm5,xmm3
	movdqa xmm0,XMMWORD[((-64))+r12]
	movdqa xmm1,XMMWORD[((-48))+r12]
	movdqa xmm2,XMMWORD[((-32))+r12]
	movdqa xmm3,XMMWORD[((-16))+r12]
	pand xmm0,XMMWORD[((-64))+rdx]
	pand xmm1,XMMWORD[((-48))+rdx]
	por xmm4,xmm0
	pand xmm2,XMMWORD[((-32))+rdx]
	por xmm5,xmm1
	pand xmm3,XMMWORD[((-16))+rdx]
	por xmm4,xmm2
	por xmm5,xmm3
	movdqa xmm0,XMMWORD[r12]
	movdqa xmm1,XMMWORD[16+r12]
	movdqa xmm2,XMMWORD[32+r12]
	movdqa xmm3,XMMWORD[48+r12]
	pand xmm0,XMMWORD[rdx]
	pand xmm1,XMMWORD[16+rdx]
	por xmm4,xmm0
	pand xmm2,XMMWORD[32+rdx]
	por xmm5,xmm1
	pand xmm3,XMMWORD[48+rdx]
	por xmm4,xmm2
	por xmm5,xmm3
	movdqa xmm0,XMMWORD[64+r12]
	movdqa xmm1,XMMWORD[80+r12]
	movdqa xmm2,XMMWORD[96+r12]
	movdqa xmm3,XMMWORD[112+r12]
	pand xmm0,XMMWORD[64+rdx]
	pand xmm1,XMMWORD[80+rdx]
	por xmm4,xmm0
	pand xmm2,XMMWORD[96+rdx]
	por xmm5,xmm1
	pand xmm3,XMMWORD[112+rdx]
	por xmm4,xmm2
	por xmm5,xmm3
	por xmm4,xmm5

	pshufd xmm0,xmm4,0x4e
	por xmm0,xmm4
	lea r12,[256+r12]
DB 102,72,15,126,195			; movq rbx,xmm0 (hand-encoded)

; Multiply-accumulate pass: like the first pass, but each product is added
; to the scratch word already in place. r14 is rewound by the length first.
	mov r10,QWORD[r9*1+r14]
	mov rbp,r8
	mul rbx
	add r10,rax
	mov rax,QWORD[rcx]
	adc rdx,0

	imul rbp,r10
	mov r11,rdx
	mov QWORD[r14],rdi		; store the carry word of the previous pass

	lea r14,[r9*1+r14]

	mul rbp
	add r10,rax
	mov rax,QWORD[8+r9*1+rsi]
	adc rdx,0
	mov rdi,rdx

	mul rbx
	add r11,rax
	mov rax,QWORD[8+rcx]
	adc rdx,0
	add r11,QWORD[8+r14]
	adc rdx,0
	mov r10,rdx

	mul rbp
	add rdi,rax
	mov rax,QWORD[16+r9*1+rsi]
	adc rdx,0
	add rdi,r11
	lea r15,[32+r9]
	lea rcx,[32+rcx]
	adc rdx,0
	mov r13,rdx
	jmp NEAR $L$inner4x

ALIGN 32
$L$inner4x:
	mul rbx
	add r10,rax
	mov rax,QWORD[((-16))+rcx]
	adc rdx,0
	add r10,QWORD[16+r14]
	lea r14,[32+r14]
	adc rdx,0
	mov r11,rdx

	mul rbp
	add r13,rax
	mov rax,QWORD[((-8))+r15*1+rsi]
	adc rdx,0
	add r13,r10
	adc rdx,0
	mov QWORD[((-32))+r14],rdi
	mov rdi,rdx

	mul rbx
	add r11,rax
	mov rax,QWORD[((-8))+rcx]
	adc rdx,0
	add r11,QWORD[((-8))+r14]
	adc rdx,0
	mov r10,rdx

	mul rbp
	add rdi,rax
	mov rax,QWORD[r15*1+rsi]
	adc rdx,0
	add rdi,r11
	adc rdx,0
	mov QWORD[((-24))+r14],r13
	mov r13,rdx

	mul rbx
	add r10,rax
	mov rax,QWORD[rcx]
	adc rdx,0
	add r10,QWORD[r14]
	adc rdx,0
	mov r11,rdx

	mul rbp
	add r13,rax
	mov rax,QWORD[8+r15*1+rsi]
	adc rdx,0
	add r13,r10
	adc rdx,0
	mov QWORD[((-16))+r14],rdi
	mov rdi,rdx

	mul rbx
	add r11,rax
	mov rax,QWORD[8+rcx]
	adc rdx,0
	add r11,QWORD[8+r14]
	adc rdx,0
	mov r10,rdx

	mul rbp
	add rdi,rax
	mov rax,QWORD[16+r15*1+rsi]
	adc rdx,0
	add rdi,r11
	lea rcx,[32+rcx]
	adc rdx,0
	mov QWORD[((-8))+r14],r13
	mov r13,rdx

	add r15,32
	jnz NEAR $L$inner4x

; Tail of the accumulate pass.
	mul rbx
	add r10,rax
	mov rax,QWORD[((-16))+rcx]
	adc rdx,0
	add r10,QWORD[16+r14]
	lea r14,[32+r14]
	adc rdx,0
	mov r11,rdx

	mul rbp
	add r13,rax
	mov rax,QWORD[((-8))+rsi]
	adc rdx,0
	add r13,r10
	adc rdx,0
	mov QWORD[((-32))+r14],rdi
	mov rdi,rdx

	mul rbx
	add r11,rax
	mov rax,rbp			; swap roles: multiplier goes to rax ...
	mov rbp,QWORD[((-8))+rcx]	; ... and the last [rcx] word stays in rbp
	adc rdx,0
	add r11,QWORD[((-8))+r14]
	adc rdx,0
	mov r10,rdx

	mul rbp
	add rdi,rax
	mov rax,QWORD[r9*1+rsi]
	adc rdx,0
	add rdi,r11
	adc rdx,0
	mov QWORD[((-24))+r14],r13
	mov r13,rdx

	mov QWORD[((-16))+r14],rdi
	lea rcx,[r9*1+rcx]

	xor rdi,rdi
	add r13,r10
	adc rdi,0
	add r13,QWORD[r14]
	adc rdi,0
	mov QWORD[((-8))+r14],r13

	cmp r12,QWORD[((16+8))+rsp]	; more table groups to process?
	jb NEAR $L$outer4x

; Hand-off to the final subtraction code. rax becomes 0 or -1 depending on
; the carry word rdi and on whether the top result word r13 exceeds the top
; [rcx] word (r15 is 0 here, so adc r15,r15 captures that borrow).
	xor rax,rax
	sub rbp,r13
	adc r15,r15
	or rdi,r15
	sub rax,rdi
	lea rbx,[r9*1+r14]		; rbx = start of the scratch result
	mov r12,QWORD[rcx]
	lea rbp,[rcx]
	mov rcx,r9
	sar rcx,3+2			; r9 is negative here; rcx = r9 / 32
	mov rdi,QWORD[((56+8))+rsp]	; recover the output pointer
	dec r12
	xor r10,r10
	mov r13,QWORD[8+rbp]
	mov r14,QWORD[16+rbp]
	mov r15,QWORD[24+rbp]
	jmp NEAR $L$sqr4x_sub_entry


;-----------------------------------------------------------------------
; bn_power5_nohw
;
; Runs the pair (__bn_sqr8x_internal, __bn_post4x_internal) five times and
; then calls mul4x_internal once.
; NOTE(review): judging by the names this is five modular squarings
; followed by one multiplication by a gathered table entry -- confirm
; against the caller / generating Perl script.
;
; ABI:  Microsoft x64 on entry; arguments are re-homed as
;         rcx -> rdi, rdx -> rsi, r8 -> rdx, r9 -> rcx,
;         [40+rsp] -> r8 (dereferenced, value kept at [32+rsp]),
;         [48+rsp] -> r9.
;       The seventh argument at [56+entry rsp] is read by mul4x_internal
;       through rax.
; Out:  rax = 1
; Preserved: rbx, rbp, r12-r15 (pushed) and rdi, rsi (slots at [8+rsp] /
;       [16+rsp] relative to the entry rsp).
; Stack: frame of roughly 320 + 2*(8*r9d) bytes, 64-byte aligned, reached
;       by touching every 4096-byte page; entry rsp is kept at [40+rsp].
; Live across the helper calls: xmm1 = rdi, xmm2 = rcx, xmm3 = r10
;       (negated byte length), xmm4 = rdx.
;-----------------------------------------------------------------------
global bn_power5_nohw

ALIGN 32
bn_power5_nohw:
	mov QWORD[8+rsp],rdi ;WIN64 prologue
	mov QWORD[16+rsp],rsi
	mov rax,rsp
$L$SEH_begin_bn_power5_nohw:
	mov rdi,rcx
	mov rsi,rdx
	mov rdx,r8
	mov rcx,r9
	mov r8,QWORD[40+rsp]
	mov r9,QWORD[48+rsp]

_CET_ENDBR
	mov rax,rsp			; rax = entry rsp, restored in the epilogue

	push rbx
	push rbp
	push r12
	push r13
	push r14
	push r15

$L$power5_prologue:

	shl r9d,3			; r9 = count * 8 (bytes); upper half zeroed
	lea r10d,[r9*2+r9]		; r10 = 3 * byte length (32-bit result)
	neg r9
	mov r8,QWORD[r8]		; load the word the fifth argument points at

; Pick the frame base in rbp. r11 is the distance, modulo 4096, between the
; tentative frame (rsp - 320 - 2*len) and the output pointer rdi; the two
; paths below place the frame differently depending on how r11 compares
; with 3*len.
; NOTE(review): presumably this keeps the scratch area from aliasing the
; output buffer modulo 4096 -- confirm against the generating Perl script.
	lea r11,[((-320))+r9*2+rsp]
	mov rbp,rsp
	sub r11,rdi
	and r11,4095
	cmp r10,r11
	jb NEAR $L$pwr_sp_alt
	sub rbp,r11
	lea rbp,[((-320))+r9*2+rbp]
	jmp NEAR $L$pwr_sp_done

ALIGN 32
$L$pwr_sp_alt:
	lea r10,[((4096-320))+r9*2]
	lea rbp,[((-320))+r9*2+rbp]
	sub r11,r10
	mov r10,0			; must be mov: xor would destroy CF needed by cmovc
	cmovc r11,r10			; clamp r11 to 0 if the subtraction borrowed
	sub rbp,r11
$L$pwr_sp_done:
	and rbp,-64
; Move rsp down to rbp one page at a time, reading a word from each page.
	mov r11,rsp
	sub r11,rbp
	and r11,-4096
	lea rsp,[rbp*1+r11]
	mov r10,QWORD[rsp]
	cmp rsp,rbp
	ja NEAR $L$pwr_page_walk
	jmp NEAR $L$pwr_page_walk_done

$L$pwr_page_walk:
	lea rsp,[((-4096))+rsp]
	mov r10,QWORD[rsp]
	cmp rsp,rbp
	ja NEAR $L$pwr_page_walk
$L$pwr_page_walk_done:

	mov r10,r9			; r10 = negated byte length
	neg r9				; r9  = positive byte length

	mov QWORD[32+rsp],r8		; value loaded through the fifth argument
	mov QWORD[40+rsp],rax		; entry rsp for the epilogue

$L$power5_body:
; Park values in xmm registers so they survive the helper calls.
DB 102,72,15,110,207			; movq xmm1,rdi (hand-encoded)
DB 102,72,15,110,209			; movq xmm2,rcx (hand-encoded)
DB 102,73,15,110,218			; movq xmm3,r10 (hand-encoded)
DB 102,72,15,110,226			; movq xmm4,rdx (hand-encoded)

	call __bn_sqr8x_internal
	call __bn_post4x_internal
	call __bn_sqr8x_internal
	call __bn_post4x_internal
	call __bn_sqr8x_internal
	call __bn_post4x_internal
	call __bn_sqr8x_internal
	call __bn_post4x_internal
	call __bn_sqr8x_internal
	call __bn_post4x_internal

; Set up mul4x_internal's inputs: rcx and rdx come back from xmm2/xmm4,
; rdi takes the current rsi, rax is the entry rsp, and r8 points at the
; word saved at [32+rsp] (mul4x_internal dereferences r8).
DB 102,72,15,126,209			; movq rcx,xmm2 (hand-encoded)
DB 102,72,15,126,226			; movq rdx,xmm4 (hand-encoded)
	mov rdi,rsi
	mov rax,QWORD[40+rsp]
	lea r8,[32+rsp]

	call mul4x_internal

; Epilogue: reload the entry rsp, restore the pushed registers from just
; below it, and return 1.
	mov rsi,QWORD[40+rsp]

	mov rax,1
	mov r15,QWORD[((-48))+rsi]
	mov r14,QWORD[((-40))+rsi]
	mov r13,QWORD[((-32))+rsi]
	mov r12,QWORD[((-24))+rsi]
	mov rbp,QWORD[((-16))+rsi]
	mov rbx,QWORD[((-8))+rsi]
	lea rsp,[rsi]

$L$power5_epilogue:
	mov rdi,QWORD[8+rsp] ;WIN64 epilogue
	mov rsi,QWORD[16+rsp]
	ret

$L$SEH_end_bn_power5_nohw:

1267global bn_sqr8x_internal
1268
1269
1270ALIGN 32
1271bn_sqr8x_internal:
1272__bn_sqr8x_internal:
1273
1274_CET_ENDBR
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348 lea rbp,[32+r10]
1349 lea rsi,[r9*1+rsi]
1350
1351 mov rcx,r9
1352
1353
1354 mov r14,QWORD[((-32))+rbp*1+rsi]
1355 lea rdi,[((48+8))+r9*2+rsp]
1356 mov rax,QWORD[((-24))+rbp*1+rsi]
1357 lea rdi,[((-32))+rbp*1+rdi]
1358 mov rbx,QWORD[((-16))+rbp*1+rsi]
1359 mov r15,rax
1360
1361 mul r14
1362 mov r10,rax
1363 mov rax,rbx
1364 mov r11,rdx
1365 mov QWORD[((-24))+rbp*1+rdi],r10
1366
1367 mul r14
1368 add r11,rax
1369 mov rax,rbx
1370 adc rdx,0
1371 mov QWORD[((-16))+rbp*1+rdi],r11
1372 mov r10,rdx
1373
1374
1375 mov rbx,QWORD[((-8))+rbp*1+rsi]
1376 mul r15
1377 mov r12,rax
1378 mov rax,rbx
1379 mov r13,rdx
1380
1381 lea rcx,[rbp]
1382 mul r14
1383 add r10,rax
1384 mov rax,rbx
1385 mov r11,rdx
1386 adc r11,0
1387 add r10,r12
1388 adc r11,0
1389 mov QWORD[((-8))+rcx*1+rdi],r10
1390 jmp NEAR $L$sqr4x_1st
1391
1392ALIGN 32
1393$L$sqr4x_1st:
1394 mov rbx,QWORD[rcx*1+rsi]
1395 mul r15
1396 add r13,rax
1397 mov rax,rbx
1398 mov r12,rdx
1399 adc r12,0
1400
1401 mul r14
1402 add r11,rax
1403 mov rax,rbx
1404 mov rbx,QWORD[8+rcx*1+rsi]
1405 mov r10,rdx
1406 adc r10,0
1407 add r11,r13
1408 adc r10,0
1409
1410
1411 mul r15
1412 add r12,rax
1413 mov rax,rbx
1414 mov QWORD[rcx*1+rdi],r11
1415 mov r13,rdx
1416 adc r13,0
1417
1418 mul r14
1419 add r10,rax
1420 mov rax,rbx
1421 mov rbx,QWORD[16+rcx*1+rsi]
1422 mov r11,rdx
1423 adc r11,0
1424 add r10,r12
1425 adc r11,0
1426
1427 mul r15
1428 add r13,rax
1429 mov rax,rbx
1430 mov QWORD[8+rcx*1+rdi],r10
1431 mov r12,rdx
1432 adc r12,0
1433
1434 mul r14
1435 add r11,rax
1436 mov rax,rbx
1437 mov rbx,QWORD[24+rcx*1+rsi]
1438 mov r10,rdx
1439 adc r10,0
1440 add r11,r13
1441 adc r10,0
1442
1443
1444 mul r15
1445 add r12,rax
1446 mov rax,rbx
1447 mov QWORD[16+rcx*1+rdi],r11
1448 mov r13,rdx
1449 adc r13,0
1450 lea rcx,[32+rcx]
1451
1452 mul r14
1453 add r10,rax
1454 mov rax,rbx
1455 mov r11,rdx
1456 adc r11,0
1457 add r10,r12
1458 adc r11,0
1459 mov QWORD[((-8))+rcx*1+rdi],r10
1460
1461 cmp rcx,0
1462 jne NEAR $L$sqr4x_1st
1463
1464 mul r15
1465 add r13,rax
1466 lea rbp,[16+rbp]
1467 adc rdx,0
1468 add r13,r11
1469 adc rdx,0
1470
1471 mov QWORD[rdi],r13
1472 mov r12,rdx
1473 mov QWORD[8+rdi],rdx
1474 jmp NEAR $L$sqr4x_outer
1475
1476ALIGN 32
1477$L$sqr4x_outer:
1478 mov r14,QWORD[((-32))+rbp*1+rsi]
1479 lea rdi,[((48+8))+r9*2+rsp]
1480 mov rax,QWORD[((-24))+rbp*1+rsi]
1481 lea rdi,[((-32))+rbp*1+rdi]
1482 mov rbx,QWORD[((-16))+rbp*1+rsi]
1483 mov r15,rax
1484
1485 mul r14
1486 mov r10,QWORD[((-24))+rbp*1+rdi]
1487 add r10,rax
1488 mov rax,rbx
1489 adc rdx,0
1490 mov QWORD[((-24))+rbp*1+rdi],r10
1491 mov r11,rdx
1492
1493 mul r14
1494 add r11,rax
1495 mov rax,rbx
1496 adc rdx,0
1497 add r11,QWORD[((-16))+rbp*1+rdi]
1498 mov r10,rdx
1499 adc r10,0
1500 mov QWORD[((-16))+rbp*1+rdi],r11
1501
1502 xor r12,r12
1503
1504 mov rbx,QWORD[((-8))+rbp*1+rsi]
1505 mul r15
1506 add r12,rax
1507 mov rax,rbx
1508 adc rdx,0
1509 add r12,QWORD[((-8))+rbp*1+rdi]
1510 mov r13,rdx
1511 adc r13,0
1512
1513 mul r14
1514 add r10,rax
1515 mov rax,rbx
1516 adc rdx,0
1517 add r10,r12
1518 mov r11,rdx
1519 adc r11,0
1520 mov QWORD[((-8))+rbp*1+rdi],r10
1521
1522 lea rcx,[rbp]
1523 jmp NEAR $L$sqr4x_inner
1524
1525ALIGN 32
1526$L$sqr4x_inner:
1527 mov rbx,QWORD[rcx*1+rsi]
1528 mul r15
1529 add r13,rax
1530 mov rax,rbx
1531 mov r12,rdx
1532 adc r12,0
1533 add r13,QWORD[rcx*1+rdi]
1534 adc r12,0
1535
1536 DB 0x67
1537 mul r14
1538 add r11,rax
1539 mov rax,rbx
1540 mov rbx,QWORD[8+rcx*1+rsi]
1541 mov r10,rdx
1542 adc r10,0
1543 add r11,r13
1544 adc r10,0
1545
1546 mul r15
1547 add r12,rax
1548 mov QWORD[rcx*1+rdi],r11
1549 mov rax,rbx
1550 mov r13,rdx
1551 adc r13,0
1552 add r12,QWORD[8+rcx*1+rdi]
1553 lea rcx,[16+rcx]
1554 adc r13,0
1555
1556 mul r14
1557 add r10,rax
1558 mov rax,rbx
1559 adc rdx,0
1560 add r10,r12
1561 mov r11,rdx
1562 adc r11,0
1563 mov QWORD[((-8))+rcx*1+rdi],r10
1564
1565 cmp rcx,0
1566 jne NEAR $L$sqr4x_inner
1567
1568 DB 0x67
1569 mul r15
1570 add r13,rax
1571 adc rdx,0
1572 add r13,r11
1573 adc rdx,0
1574
1575 mov QWORD[rdi],r13
1576 mov r12,rdx
1577 mov QWORD[8+rdi],rdx
1578
1579 add rbp,16
1580 jnz NEAR $L$sqr4x_outer
1581
1582
1583 mov r14,QWORD[((-32))+rsi]
1584 lea rdi,[((48+8))+r9*2+rsp]
1585 mov rax,QWORD[((-24))+rsi]
1586 lea rdi,[((-32))+rbp*1+rdi]
1587 mov rbx,QWORD[((-16))+rsi]
1588 mov r15,rax
1589
1590 mul r14
1591 add r10,rax
1592 mov rax,rbx
1593 mov r11,rdx
1594 adc r11,0
1595
1596 mul r14
1597 add r11,rax
1598 mov rax,rbx
1599 mov QWORD[((-24))+rdi],r10
1600 mov r10,rdx
1601 adc r10,0
1602 add r11,r13
1603 mov rbx,QWORD[((-8))+rsi]
1604 adc r10,0
1605
1606 mul r15
1607 add r12,rax
1608 mov rax,rbx
1609 mov QWORD[((-16))+rdi],r11
1610 mov r13,rdx
1611 adc r13,0
1612
1613 mul r14
1614 add r10,rax
1615 mov rax,rbx
1616 mov r11,rdx
1617 adc r11,0
1618 add r10,r12
1619 adc r11,0
1620 mov QWORD[((-8))+rdi],r10
1621
1622 mul r15
1623 add r13,rax
1624 mov rax,QWORD[((-16))+rsi]
1625 adc rdx,0
1626 add r13,r11
1627 adc rdx,0
1628
1629 mov QWORD[rdi],r13
1630 mov r12,rdx
1631 mov QWORD[8+rdi],rdx
1632
1633 mul rbx
1634 add rbp,16
1635 xor r14,r14
1636 sub rbp,r9
1637 xor r15,r15
1638
1639 add rax,r12
1640 adc rdx,0
1641 mov QWORD[8+rdi],rax
1642 mov QWORD[16+rdi],rdx
1643 mov QWORD[24+rdi],r15
1644
1645 mov rax,QWORD[((-16))+rbp*1+rsi]
1646 lea rdi,[((48+8))+rsp]
1647 xor r10,r10
1648 mov r11,QWORD[8+rdi]
1649
1650 lea r12,[r10*2+r14]
1651 shr r10,63
1652 lea r13,[r11*2+rcx]
1653 shr r11,63
1654 or r13,r10
1655 mov r10,QWORD[16+rdi]
1656 mov r14,r11
1657 mul rax
1658 neg r15
1659 mov r11,QWORD[24+rdi]
1660 adc r12,rax
1661 mov rax,QWORD[((-8))+rbp*1+rsi]
1662 mov QWORD[rdi],r12
1663 adc r13,rdx
1664
1665 lea rbx,[r10*2+r14]
1666 mov QWORD[8+rdi],r13
1667 sbb r15,r15
1668 shr r10,63
1669 lea r8,[r11*2+rcx]
1670 shr r11,63
1671 or r8,r10
1672 mov r10,QWORD[32+rdi]
1673 mov r14,r11
1674 mul rax
1675 neg r15
1676 mov r11,QWORD[40+rdi]
1677 adc rbx,rax
1678 mov rax,QWORD[rbp*1+rsi]
1679 mov QWORD[16+rdi],rbx
1680 adc r8,rdx
1681 lea rbp,[16+rbp]
1682 mov QWORD[24+rdi],r8
1683 sbb r15,r15
1684 lea rdi,[64+rdi]
1685 jmp NEAR $L$sqr4x_shift_n_add
1686
1687ALIGN 32
1688$L$sqr4x_shift_n_add:
1689 lea r12,[r10*2+r14]
1690 shr r10,63
1691 lea r13,[r11*2+rcx]
1692 shr r11,63
1693 or r13,r10
1694 mov r10,QWORD[((-16))+rdi]
1695 mov r14,r11
1696 mul rax
1697 neg r15
1698 mov r11,QWORD[((-8))+rdi]
1699 adc r12,rax
1700 mov rax,QWORD[((-8))+rbp*1+rsi]
1701 mov QWORD[((-32))+rdi],r12
1702 adc r13,rdx
1703
1704 lea rbx,[r10*2+r14]
1705 mov QWORD[((-24))+rdi],r13
1706 sbb r15,r15
1707 shr r10,63
1708 lea r8,[r11*2+rcx]
1709 shr r11,63
1710 or r8,r10
1711 mov r10,QWORD[rdi]
1712 mov r14,r11
1713 mul rax
1714 neg r15
1715 mov r11,QWORD[8+rdi]
1716 adc rbx,rax
1717 mov rax,QWORD[rbp*1+rsi]
1718 mov QWORD[((-16))+rdi],rbx
1719 adc r8,rdx
1720
1721 lea r12,[r10*2+r14]
1722 mov QWORD[((-8))+rdi],r8
1723 sbb r15,r15
1724 shr r10,63
1725 lea r13,[r11*2+rcx]
1726 shr r11,63
1727 or r13,r10
1728 mov r10,QWORD[16+rdi]
1729 mov r14,r11
1730 mul rax
1731 neg r15
1732 mov r11,QWORD[24+rdi]
1733 adc r12,rax
1734 mov rax,QWORD[8+rbp*1+rsi]
1735 mov QWORD[rdi],r12
1736 adc r13,rdx
1737
1738 lea rbx,[r10*2+r14]
1739 mov QWORD[8+rdi],r13
1740 sbb r15,r15
1741 shr r10,63
1742 lea r8,[r11*2+rcx]
1743 shr r11,63
1744 or r8,r10
1745 mov r10,QWORD[32+rdi]
1746 mov r14,r11
1747 mul rax
1748 neg r15
1749 mov r11,QWORD[40+rdi]
1750 adc rbx,rax
1751 mov rax,QWORD[16+rbp*1+rsi]
1752 mov QWORD[16+rdi],rbx
1753 adc r8,rdx
1754 mov QWORD[24+rdi],r8
1755 sbb r15,r15
1756 lea rdi,[64+rdi]
1757 add rbp,32
1758 jnz NEAR $L$sqr4x_shift_n_add
1759
1760 lea r12,[r10*2+r14]
1761 DB 0x67
1762 shr r10,63
1763 lea r13,[r11*2+rcx]
1764 shr r11,63
1765 or r13,r10
1766 mov r10,QWORD[((-16))+rdi]
1767 mov r14,r11
1768 mul rax
1769 neg r15
1770 mov r11,QWORD[((-8))+rdi]
1771 adc r12,rax
1772 mov rax,QWORD[((-8))+rsi]
1773 mov QWORD[((-32))+rdi],r12
1774 adc r13,rdx
1775
1776 lea rbx,[r10*2+r14]
1777 mov QWORD[((-24))+rdi],r13
1778 sbb r15,r15
1779 shr r10,63
1780 lea r8,[r11*2+rcx]
1781 shr r11,63
1782 or r8,r10
1783 mul rax
1784 neg r15
1785 adc rbx,rax
1786 adc r8,rdx
1787 mov QWORD[((-16))+rdi],rbx
1788 mov QWORD[((-8))+rdi],r8
1789DB 102,72,15,126,213
; ---------------------------------------------------------------------
; __bn_sqr8x_reduction -- internal routine, leaves via plain `ret`.
; Reads on entry: r9, rbp, xmm2, xmm3 and the caller's frame at rsp
;   ([32+8+rsp] is the per-word multiplier factor, the working vector
;   starts at [48+8+rsp]).
; Writes frame slots:
;   [0+8+rsp] = rbp + r9        (end pointer that rbp is compared with)
;   [8+8+rsp] = rsp+56 + 2*r9   (end pointer that rdi is compared with)
; Clobbers rax, rbx, rcx, rdx, rsi, rdi, rbp, r8-r15 and flags.
; NOTE(review): this looks like the word-by-word Montgomery reduction of
;   a double-width product, with rbp pointing at the modulus and
;   [32+8+rsp] holding n0 -- confirm against the generating Perl script.
; ---------------------------------------------------------------------
1790__bn_sqr8x_reduction:
1791 xor rax,rax
1792 lea rcx,[rbp*1+r9]
1793 lea rdx,[((48+8))+r9*2+rsp]
1794 mov QWORD[((0+8))+rsp],rcx
1795 lea rdi,[((48+8))+r9*1+rsp]
1796 mov QWORD[((8+8))+rsp],rdx
1797 neg r9
1798 jmp NEAR $L$8x_reduction_loop
1799
1800ALIGN 32
; Outer loop: one pass per 8-qword window of the working vector.
; rdi += r9; r9 is the negated entry value on the first pass and the
; value reloaded from xmm3 on later passes.
1801$L$8x_reduction_loop:
1802 lea rdi,[r9*1+rdi]
; 0x66 is a prefix byte prepended to the following mov (presumably padding).
1803 DB 0x66
1804 mov rbx,QWORD[rdi]
1805 mov r9,QWORD[8+rdi]
1806 mov r10,QWORD[16+rdi]
1807 mov r11,QWORD[24+rdi]
1808 mov r12,QWORD[32+rdi]
1809 mov r13,QWORD[40+rdi]
1810 mov r14,QWORD[48+rdi]
1811 mov r15,QWORD[56+rdi]
; rax is the carry word (0 on the first pass); park it at the end pointer.
1812 mov QWORD[rdx],rax
1813 lea rdi,[64+rdi]
1814
; 0x67 is a prefix byte prepended to the following mov (presumably padding).
1815 DB 0x67
; r8 = lowest word; rbx = lowest word * [32+8+rsp] (low 64 bits).
1816 mov r8,rbx
1817 imul rbx,QWORD[((32+8))+rsp]
1818 mov rax,QWORD[rbp]
1819 mov ecx,8
1820 jmp NEAR $L$8x_reduce
1821
1822ALIGN 32
; Inner loop, ecx = 8..1: accumulate rbx * (8 qwords at rbp) into r8..r15,
; shifting the accumulator down by one word per iteration.
1823$L$8x_reduce:
1824 mul rbx
1825 mov rax,QWORD[8+rbp]
; The low product word is not added; `neg` sets CF = (r8 != 0) and that
; carry is folded into the high word instead.
1826 neg r8
1827 mov r8,rdx
1828 adc r8,0
1829
1830 mul rbx
1831 add r9,rax
1832 mov rax,QWORD[16+rbp]
1833 adc rdx,0
1834 add r8,r9
; Remember this iteration's multiplier in the frame for the tail loop.
1835 mov QWORD[((48-8+8))+rcx*8+rsp],rbx
1836 mov r9,rdx
1837 adc r9,0
1838
1839 mul rbx
1840 add r10,rax
1841 mov rax,QWORD[24+rbp]
1842 adc rdx,0
1843 add r9,r10
1844 mov rsi,QWORD[((32+8))+rsp]
1845 mov r10,rdx
1846 adc r10,0
1847
1848 mul rbx
1849 add r11,rax
1850 mov rax,QWORD[32+rbp]
1851 adc rdx,0
; rsi = [32+8+rsp] * new lowest word: multiplier for the next iteration.
1852 imul rsi,r8
1853 add r10,r11
1854 mov r11,rdx
1855 adc r11,0
1856
1857 mul rbx
1858 add r12,rax
1859 mov rax,QWORD[40+rbp]
1860 adc rdx,0
1861 add r11,r12
1862 mov r12,rdx
1863 adc r12,0
1864
1865 mul rbx
1866 add r13,rax
1867 mov rax,QWORD[48+rbp]
1868 adc rdx,0
1869 add r12,r13
1870 mov r13,rdx
1871 adc r13,0
1872
1873 mul rbx
1874 add r14,rax
1875 mov rax,QWORD[56+rbp]
1876 adc rdx,0
1877 add r13,r14
1878 mov r14,rdx
1879 adc r14,0
1880
1881 mul rbx
1882 mov rbx,rsi
1883 add r15,rax
1884 mov rax,QWORD[rbp]
1885 adc rdx,0
1886 add r14,r15
1887 mov r15,rdx
1888 adc r15,0
1889
1890 dec ecx
1891 jnz NEAR $L$8x_reduce
1892
; Advance rbp by 8 qwords; if it reached the saved end pointer there is
; nothing left to multiply for this window.
1893 lea rbp,[64+rbp]
1894 xor rax,rax
1895 mov rdx,QWORD[((8+8))+rsp]
1896 cmp rbp,QWORD[((0+8))+rsp]
1897 jae NEAR $L$8x_no_tail
1898
; 0x66 is a prefix byte prepended to the following add (presumably padding).
1899 DB 0x66
; Add the next 8 words of the working vector; keep the carry as a
; 0 / all-ones mask in rsi.
1900 add r8,QWORD[rdi]
1901 adc r9,QWORD[8+rdi]
1902 adc r10,QWORD[16+rdi]
1903 adc r11,QWORD[24+rdi]
1904 adc r12,QWORD[32+rdi]
1905 adc r13,QWORD[40+rdi]
1906 adc r14,QWORD[48+rdi]
1907 adc r15,QWORD[56+rdi]
1908 sbb rsi,rsi
1909
; First multiplier saved by the reduce loop (stored when rcx was 8).
1910 mov rbx,QWORD[((48+56+8))+rsp]
1911 mov ecx,8
1912 mov rax,QWORD[rbp]
1913 jmp NEAR $L$8x_tail
1914
1915ALIGN 32
; Tail loop, ecx = 8..1: replay the 8 saved multipliers against the next
; 8 qwords at rbp, emitting one finished word to [rdi] per iteration.
1916$L$8x_tail:
1917 mul rbx
1918 add r8,rax
1919 mov rax,QWORD[8+rbp]
1920 mov QWORD[rdi],r8
1921 mov r8,rdx
1922 adc r8,0
1923
1924 mul rbx
1925 add r9,rax
1926 mov rax,QWORD[16+rbp]
1927 adc rdx,0
1928 add r8,r9
1929 lea rdi,[8+rdi]
1930 mov r9,rdx
1931 adc r9,0
1932
1933 mul rbx
1934 add r10,rax
1935 mov rax,QWORD[24+rbp]
1936 adc rdx,0
1937 add r9,r10
1938 mov r10,rdx
1939 adc r10,0
1940
1941 mul rbx
1942 add r11,rax
1943 mov rax,QWORD[32+rbp]
1944 adc rdx,0
1945 add r10,r11
1946 mov r11,rdx
1947 adc r11,0
1948
1949 mul rbx
1950 add r12,rax
1951 mov rax,QWORD[40+rbp]
1952 adc rdx,0
1953 add r11,r12
1954 mov r12,rdx
1955 adc r12,0
1956
1957 mul rbx
1958 add r13,rax
1959 mov rax,QWORD[48+rbp]
1960 adc rdx,0
1961 add r12,r13
1962 mov r13,rdx
1963 adc r13,0
1964
1965 mul rbx
1966 add r14,rax
1967 mov rax,QWORD[56+rbp]
1968 adc rdx,0
1969 add r13,r14
1970 mov r14,rdx
1971 adc r14,0
1972
1973 mul rbx
; Fetch the next saved multiplier from the frame.
1974 mov rbx,QWORD[((48-16+8))+rcx*8+rsp]
1975 add r15,rax
1976 adc rdx,0
1977 add r14,r15
1978 mov rax,QWORD[rbp]
1979 mov r15,rdx
1980 adc r15,0
1981
1982 dec ecx
1983 jnz NEAR $L$8x_tail
1984
1985 lea rbp,[64+rbp]
1986 mov rdx,QWORD[((8+8))+rsp]
1987 cmp rbp,QWORD[((0+8))+rsp]
1988 jae NEAR $L$8x_tail_done
1989
; More words at rbp remain: restore CF from the rsi mask, add the next
; 8 working-vector words, save the new carry mask, run the tail again.
1990 mov rbx,QWORD[((48+56+8))+rsp]
1991 neg rsi
1992 mov rax,QWORD[rbp]
1993 adc r8,QWORD[rdi]
1994 adc r9,QWORD[8+rdi]
1995 adc r10,QWORD[16+rdi]
1996 adc r11,QWORD[24+rdi]
1997 adc r12,QWORD[32+rdi]
1998 adc r13,QWORD[40+rdi]
1999 adc r14,QWORD[48+rdi]
2000 adc r15,QWORD[56+rdi]
2001 sbb rsi,rsi
2002
2003 mov ecx,8
2004 jmp NEAR $L$8x_tail
2005
2006ALIGN 32
; Fold in the carry word parked at [rdx] and ripple it through r9..r15
; into rax; then restore CF from the rsi mask for the additions below.
2007$L$8x_tail_done:
2008 xor rax,rax
2009 add r8,QWORD[rdx]
2010 adc r9,0
2011 adc r10,0
2012 adc r11,0
2013 adc r12,0
2014 adc r13,0
2015 adc r14,0
2016 adc r15,0
2017 adc rax,0
2018
2019 neg rsi
; Add the top 8 words of this window; rax collects the outgoing carry.
; (When reached directly from the reduce loop, CF is clear from
; `xor rax,rax`: the lea/mov/cmp/jae sequence leaves CF = 0 on this path.)
2020$L$8x_no_tail:
2021 adc r8,QWORD[rdi]
2022 adc r9,QWORD[8+rdi]
2023 adc r10,QWORD[16+rdi]
2024 adc r11,QWORD[24+rdi]
2025 adc r12,QWORD[32+rdi]
2026 adc r13,QWORD[40+rdi]
2027 adc r14,QWORD[48+rdi]
2028 adc r15,QWORD[56+rdi]
2029 adc rax,0
2030 mov rcx,QWORD[((-8))+rbp]
2031 xor rsi,rsi
2032
; Encoded `movq rbp,xmm2`: reload rbp from xmm2.
2033DB 102,72,15,126,213
2034
2035 mov QWORD[rdi],r8
2036 mov QWORD[8+rdi],r9
; Encoded `movq r9,xmm3`: reload r9 from xmm3 (r9 was used as an accumulator).
2037DB 102,73,15,126,217
2038 mov QWORD[16+rdi],r10
2039 mov QWORD[24+rdi],r11
2040 mov QWORD[32+rdi],r12
2041 mov QWORD[40+rdi],r13
2042 mov QWORD[48+rdi],r14
2043 mov QWORD[56+rdi],r15
2044 lea rdi,[64+rdi]
2045
; Repeat while rdi is (unsigned) below the end pointer in rdx.
2046 cmp rdi,rdx
2047 jb NEAR $L$8x_reduction_loop
2048 ret
2049
2050
2051
2052ALIGN 32
; ---------------------------------------------------------------------
; __bn_post4x_internal -- internal routine, leaves via plain `ret`.
; Reads on entry: rax, rbp, rdi, r9 and xmm1.
; For each group of 4 qwords it computes
;     [dst] = [rdi_in + r9 + i] + (~[rbp + i] & -rax) + carry
; where dst is the pointer held in xmm1. The very first word uses
; ~([rbp] - 1), which equals -[rbp], so the masked operand is the
; two's-complement negation of the value at rbp (given a non-zero low word).
; The loop counter is rcx = r9 >> 5 counted UP to zero, so r9 must be
; negative on entry.
; On exit: r10 = r9 as it was on entry, r9 = -r9; rdi and rsi have been
; reloaded from xmm1 (rdi then advanced past the words written).
; NOTE(review): presumably the conditional final subtraction of the
;   modulus, with rax in {0,1} selecting it -- confirm against callers.
; ---------------------------------------------------------------------
2053__bn_post4x_internal:
2054
2055 mov r12,QWORD[rbp]
2056 lea rbx,[r9*1+rdi]
2057 mov rcx,r9
; Encoded `movq rdi,xmm1`.
2058DB 102,72,15,126,207
; rax becomes the AND mask (-rax).
2059 neg rax
; Encoded `movq rsi,xmm1`.
2060DB 102,72,15,126,206
; rcx = r9 / 32 (arithmetic shift): number of 4-qword groups, negative.
2061 sar rcx,3+2
; ~(x-1) == -x: supplies the "+1" of the two's-complement negation.
2062 dec r12
; r10 holds the running carry as 0 / all-ones.
2063 xor r10,r10
2064 mov r13,QWORD[8+rbp]
2065 mov r14,QWORD[16+rbp]
2066 mov r15,QWORD[24+rbp]
2067 jmp NEAR $L$sqr4x_sub_entry
2068
2069ALIGN 16
2070$L$sqr4x_sub:
2071 mov r12,QWORD[rbp]
2072 mov r13,QWORD[8+rbp]
2073 mov r14,QWORD[16+rbp]
2074 mov r15,QWORD[24+rbp]
2075$L$sqr4x_sub_entry:
2076 lea rbp,[32+rbp]
2077 not r12
2078 not r13
2079 not r14
2080 not r15
2081 and r12,rax
2082 and r13,rax
2083 and r14,rax
2084 and r15,rax
2085
; Restore CF from the r10 mask, add 4 words, save the new carry in r10.
2086 neg r10
2087 adc r12,QWORD[rbx]
2088 adc r13,QWORD[8+rbx]
2089 adc r14,QWORD[16+rbx]
2090 adc r15,QWORD[24+rbx]
2091 mov QWORD[rdi],r12
2092 lea rbx,[32+rbx]
2093 mov QWORD[8+rdi],r13
2094 sbb r10,r10
2095 mov QWORD[16+rdi],r14
2096 mov QWORD[24+rdi],r15
2097 lea rdi,[32+rdi]
2098
2099 inc rcx
2100 jnz NEAR $L$sqr4x_sub
2101
2102 mov r10,r9
2103 neg r9
2104 ret
2105
2106
David Benjamin3efe2eb2024-05-08 22:24:27 -07002107global bn_mulx4x_mont_gather5
David Benjaminfe0c91e2024-03-18 15:37:24 +10002108
2109ALIGN 32
; ---------------------------------------------------------------------
; bn_mulx4x_mont_gather5 -- exported entry point (Microsoft x64 ABI).
; The prologue saves rdi/rsi in the caller's home slots and moves the six
; arguments (rcx, rdx, r8, r9, [40+rsp], [48+rsp]) into
; rdi, rsi, rdx, rcx, r8, r9. A seventh 32-bit argument is read later by
; mulx4x_internal from [56+rax], where rax is the entry rsp.
; Carves a 64-byte-aligned scratch frame below the stack, probing one
; page at a time, calls mulx4x_internal, then restores the callee-saved
; registers and returns rax = 1.
; NOTE(review): r9 is treated as a word count (scaled by 8 below) and
;   r8 as a pointer whose first qword is loaded -- presumably num and
;   &n0; confirm against the C prototype.
; ---------------------------------------------------------------------
2110bn_mulx4x_mont_gather5:
2111 mov QWORD[8+rsp],rdi ;WIN64 prologue
2112 mov QWORD[16+rsp],rsi
2113 mov rax,rsp
2114$L$SEH_begin_bn_mulx4x_mont_gather5:
2115 mov rdi,rcx
2116 mov rsi,rdx
2117 mov rdx,r8
2118 mov rcx,r9
2119 mov r8,QWORD[40+rsp]
2120 mov r9,QWORD[48+rsp]
2121
2122
2123
David Benjamin3efe2eb2024-05-08 22:24:27 -07002124_CET_ENDBR
; rax keeps the entry stack pointer; it is stored in the frame and used
; by the epilogue to find the pushed registers.
David Benjaminfe0c91e2024-03-18 15:37:24 +10002125 mov rax,rsp
2126
David Benjaminfe0c91e2024-03-18 15:37:24 +10002127 push rbx
2128
2129 push rbp
2130
2131 push r12
2132
2133 push r13
2134
2135 push r14
2136
2137 push r15
2138
2139$L$mulx4x_prologue:
2140
David Benjamin3efe2eb2024-05-08 22:24:27 -07002141
2142
2143
; r9 = 8*r9 (32-bit shift zero-extends), r10 = 3*r9, then r9 = -r9;
; r8 = first qword at the pointer passed in r8.
David Benjaminfe0c91e2024-03-18 15:37:24 +10002144 shl r9d,3
2145 lea r10,[r9*2+r9]
2146 neg r9
2147 mov r8,QWORD[r8]
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
; Choose the frame base rbp from the distance, modulo 4096, between the
; tentative frame (rsp - 320 - 2*|r9|) and rdi.
; NOTE(review): presumably avoids the frame aliasing the output buffer
;   modulo the page size -- confirm against the Perl source comments.
2158 lea r11,[((-320))+r9*2+rsp]
2159 mov rbp,rsp
2160 sub r11,rdi
2161 and r11,4095
2162 cmp r10,r11
2163 jb NEAR $L$mulx4xsp_alt
2164 sub rbp,r11
2165 lea rbp,[((-320))+r9*2+rbp]
2166 jmp NEAR $L$mulx4xsp_done
2167
2168$L$mulx4xsp_alt:
2169 lea r10,[((4096-320))+r9*2]
2170 lea rbp,[((-320))+r9*2+rbp]
2171 sub r11,r10
; If the subtraction borrowed, use an adjustment of 0.
2172 mov r10,0
2173 cmovc r11,r10
2174 sub rbp,r11
2175$L$mulx4xsp_done:
; Align the frame to 64 bytes, then lower rsp to it in 4096-byte steps,
; reading one qword per step so every page in between is touched.
2176 and rbp,-64
2177 mov r11,rsp
2178 sub r11,rbp
2179 and r11,-4096
2180 lea rsp,[rbp*1+r11]
2181 mov r10,QWORD[rsp]
2182 cmp rsp,rbp
2183 ja NEAR $L$mulx4x_page_walk
2184 jmp NEAR $L$mulx4x_page_walk_done
2185
2186$L$mulx4x_page_walk:
2187 lea rsp,[((-4096))+rsp]
2188 mov r10,QWORD[rsp]
2189 cmp rsp,rbp
2190 ja NEAR $L$mulx4x_page_walk
2191$L$mulx4x_page_walk_done:
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
; Frame slots: [32+rsp] = value loaded from *r8, [40+rsp] = entry rsp.
2205 mov QWORD[32+rsp],r8
2206 mov QWORD[40+rsp],rax
2207
2208$L$mulx4x_body:
2209 call mulx4x_internal
2210
; Epilogue: rsi = entry rsp; the six pushed registers sit just below it.
2211 mov rsi,QWORD[40+rsp]
2212
2213 mov rax,1
2214
2215 mov r15,QWORD[((-48))+rsi]
2216
2217 mov r14,QWORD[((-40))+rsi]
2218
2219 mov r13,QWORD[((-32))+rsi]
2220
2221 mov r12,QWORD[((-24))+rsi]
2222
2223 mov rbp,QWORD[((-16))+rsi]
2224
2225 mov rbx,QWORD[((-8))+rsi]
2226
2227 lea rsp,[rsi]
2228
2229$L$mulx4x_epilogue:
2230 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
2231 mov rsi,QWORD[16+rsp]
2232 ret
2233
2234$L$SEH_end_bn_mulx4x_mont_gather5:
2235
2236
2237ALIGN 32
; ---------------------------------------------------------------------
; mulx4x_internal -- internal routine, called from bn_mulx4x_mont_gather5
; and bn_powerx5. It does not return itself: it ends with a jump to
; $L$sqrx4x_sub_entry (defined elsewhere in the file).
; Reads on entry: rax (frame/stack pointer from which the 32-bit index is
;   fetched at [56+rax]), rdx (table base), rsi and rcx (limb vectors),
;   rdi, r9, and the frame at rsp ([32+8+rsp] is the per-word factor
;   used by imul).
; Frame slots written (offsets relative to rsp inside this routine):
;   [8]     r9 as passed in
;   [8+8]   current table row pointer (rdi)
;   [16+8]  table end pointer (rdx + 128 + 32*|r9|)
;   [24+8]  inner-loop trip count
;   [56+8]  rdi as passed in
; Uses MULX/ADCX/ADOX, so it requires BMI2 and ADX.
; NOTE(review): presumably Montgomery multiplication by a table entry
;   selected without index-dependent memory access -- confirm against
;   the Perl source.
; ---------------------------------------------------------------------
2238mulx4x_internal:
2239
2240 mov QWORD[8+rsp],r9
2241 mov r10,r9
2242 neg r9
2243 shl r9,5
2244 neg r10
2245 lea r13,[128+r9*1+rdx]
2246 shr r9,5+5
; xmm5 = 32-bit index argument, broadcast to all four lanes below.
2247 movd xmm5,DWORD[56+rax]
2248 sub r9,1
2249 lea rax,[$L$inc]
2250 mov QWORD[((16+8))+rsp],r13
2251 mov QWORD[((24+8))+rsp],r9
2252 mov QWORD[((56+8))+rsp],rdi
2253 movdqa xmm0,XMMWORD[rax]
2254 movdqa xmm1,XMMWORD[16+rax]
2255 lea r10,[((88-112))+r10*1+rsp]
2256 lea rdi,[128+rdx]
2257
; Build 16 compare masks: starting from the two $L$inc constants, keep
; adding the increment (xmm4) and compare each counter vector with the
; broadcast index. The masks are stored at [112+r10] .. [352+r10].
2258 pshufd xmm5,xmm5,0
2259 movdqa xmm4,xmm1
; 0x67 bytes here are prefixes prepended to the next instruction
; (presumably padding).
2260 DB 0x67
2261 movdqa xmm2,xmm1
2262 DB 0x67
2263 paddd xmm1,xmm0
2264 pcmpeqd xmm0,xmm5
2265 movdqa xmm3,xmm4
2266 paddd xmm2,xmm1
2267 pcmpeqd xmm1,xmm5
2268 movdqa XMMWORD[112+r10],xmm0
2269 movdqa xmm0,xmm4
2270
2271 paddd xmm3,xmm2
2272 pcmpeqd xmm2,xmm5
2273 movdqa XMMWORD[128+r10],xmm1
2274 movdqa xmm1,xmm4
2275
2276 paddd xmm0,xmm3
2277 pcmpeqd xmm3,xmm5
2278 movdqa XMMWORD[144+r10],xmm2
2279 movdqa xmm2,xmm4
2280
2281 paddd xmm1,xmm0
2282 pcmpeqd xmm0,xmm5
2283 movdqa XMMWORD[160+r10],xmm3
2284 movdqa xmm3,xmm4
2285 paddd xmm2,xmm1
2286 pcmpeqd xmm1,xmm5
2287 movdqa XMMWORD[176+r10],xmm0
2288 movdqa xmm0,xmm4
2289
2290 paddd xmm3,xmm2
2291 pcmpeqd xmm2,xmm5
2292 movdqa XMMWORD[192+r10],xmm1
2293 movdqa xmm1,xmm4
2294
2295 paddd xmm0,xmm3
2296 pcmpeqd xmm3,xmm5
2297 movdqa XMMWORD[208+r10],xmm2
2298 movdqa xmm2,xmm4
2299
2300 paddd xmm1,xmm0
2301 pcmpeqd xmm0,xmm5
2302 movdqa XMMWORD[224+r10],xmm3
2303 movdqa xmm3,xmm4
2304 paddd xmm2,xmm1
2305 pcmpeqd xmm1,xmm5
2306 movdqa XMMWORD[240+r10],xmm0
2307 movdqa xmm0,xmm4
2308
2309 paddd xmm3,xmm2
2310 pcmpeqd xmm2,xmm5
2311 movdqa XMMWORD[256+r10],xmm1
2312 movdqa xmm1,xmm4
2313
2314 paddd xmm0,xmm3
2315 pcmpeqd xmm3,xmm5
2316 movdqa XMMWORD[272+r10],xmm2
2317 movdqa xmm2,xmm4
2318
2319 paddd xmm1,xmm0
2320 pcmpeqd xmm0,xmm5
2321 movdqa XMMWORD[288+r10],xmm3
2322 movdqa xmm3,xmm4
2323 DB 0x67
2324 paddd xmm2,xmm1
2325 pcmpeqd xmm1,xmm5
2326 movdqa XMMWORD[304+r10],xmm0
2327
2328 paddd xmm3,xmm2
2329 pcmpeqd xmm2,xmm5
2330 movdqa XMMWORD[320+r10],xmm1
2331
2332 pcmpeqd xmm3,xmm5
2333 movdqa XMMWORD[336+r10],xmm2
2334
; Gather the first multiplier word: every 16-byte slot of the 256-byte
; table row [rdi-128 .. rdi+127] is read, ANDed with its mask and
; combined, so the addresses touched do not depend on the index.
2335 pand xmm0,XMMWORD[64+rdi]
2336 pand xmm1,XMMWORD[80+rdi]
2337 pand xmm2,XMMWORD[96+rdi]
2338 movdqa XMMWORD[352+r10],xmm3
2339 pand xmm3,XMMWORD[112+rdi]
2340 por xmm0,xmm2
2341 por xmm1,xmm3
2342 movdqa xmm4,XMMWORD[((-128))+rdi]
2343 movdqa xmm5,XMMWORD[((-112))+rdi]
2344 movdqa xmm2,XMMWORD[((-96))+rdi]
2345 pand xmm4,XMMWORD[112+r10]
2346 movdqa xmm3,XMMWORD[((-80))+rdi]
2347 pand xmm5,XMMWORD[128+r10]
2348 por xmm0,xmm4
2349 pand xmm2,XMMWORD[144+r10]
2350 por xmm1,xmm5
2351 pand xmm3,XMMWORD[160+r10]
2352 por xmm0,xmm2
2353 por xmm1,xmm3
2354 movdqa xmm4,XMMWORD[((-64))+rdi]
2355 movdqa xmm5,XMMWORD[((-48))+rdi]
2356 movdqa xmm2,XMMWORD[((-32))+rdi]
2357 pand xmm4,XMMWORD[176+r10]
2358 movdqa xmm3,XMMWORD[((-16))+rdi]
2359 pand xmm5,XMMWORD[192+r10]
2360 por xmm0,xmm4
2361 pand xmm2,XMMWORD[208+r10]
2362 por xmm1,xmm5
2363 pand xmm3,XMMWORD[224+r10]
2364 por xmm0,xmm2
2365 por xmm1,xmm3
2366 movdqa xmm4,XMMWORD[rdi]
2367 movdqa xmm5,XMMWORD[16+rdi]
2368 movdqa xmm2,XMMWORD[32+rdi]
2369 pand xmm4,XMMWORD[240+r10]
2370 movdqa xmm3,XMMWORD[48+rdi]
2371 pand xmm5,XMMWORD[256+r10]
2372 por xmm0,xmm4
2373 pand xmm2,XMMWORD[272+r10]
2374 por xmm1,xmm5
2375 pand xmm3,XMMWORD[288+r10]
2376 por xmm0,xmm2
2377 por xmm1,xmm3
; The two halves are merged with pxor here (the later gathers use por).
2378 pxor xmm0,xmm1
2379
; Fold the upper 64 bits onto the lower 64 bits.
2380 pshufd xmm1,xmm0,0x4e
2381 por xmm0,xmm1
2382 lea rdi,[256+rdi]
; Encoded `movq rdx,xmm0`: the gathered word becomes the MULX multiplier.
2383DB 102,72,15,126,194
2384 lea rbx,[((64+32+8))+rsp]
2385
; First pass, first 4 limbs: products of rdx with [rsi..], then the
; reduction multiplier r8 = low word * [32+8+rsp] times [rcx..].
; r9 keeps a copy of the gathered multiplier.
2386 mov r9,rdx
2387 mulx rax,r8,QWORD[rsi]
2388 mulx r12,r11,QWORD[8+rsi]
2389 add r11,rax
2390 mulx r13,rax,QWORD[16+rsi]
2391 adc r12,rax
2392 adc r13,0
2393 mulx r14,rax,QWORD[24+rsi]
2394
2395 mov r15,r8
2396 imul r8,QWORD[((32+8))+rsp]
; xor clears CF and OF for the ADCX/ADOX chains; rbp = 0.
2397 xor rbp,rbp
2398 mov rdx,r8
2399
2400 mov QWORD[((8+8))+rsp],rdi
2401
2402 lea rsi,[32+rsi]
2403 adcx r13,rax
2404 adcx r14,rbp
2405
2406 mulx r10,rax,QWORD[rcx]
2407 adcx r15,rax
2408 adox r10,r11
2409 mulx r11,rax,QWORD[8+rcx]
2410 adcx r10,rax
2411 adox r11,r12
2412 mulx r12,rax,QWORD[16+rcx]
; rdi = inner-loop trip count saved above.
2413 mov rdi,QWORD[((24+8))+rsp]
2414 mov QWORD[((-32))+rbx],r10
2415 adcx r11,rax
2416 adox r12,r13
2417 mulx r15,rax,QWORD[24+rcx]
2418 mov rdx,r9
2419 mov QWORD[((-24))+rbx],r11
2420 adcx r12,rax
2421 adox r15,rbp
2422 lea rcx,[32+rcx]
2423 mov QWORD[((-16))+rbx],r12
2424 jmp NEAR $L$mulx4x_1st
2425
2426ALIGN 32
; First pass, remaining limbs: 4 limbs per iteration, rdi iterations.
; rdx alternates between r9 (gathered multiplier) and r8 (reduction
; multiplier); rsi, rcx and rbx each advance by 32 bytes.
2427$L$mulx4x_1st:
2428 adcx r15,rbp
2429 mulx rax,r10,QWORD[rsi]
2430 adcx r10,r14
2431 mulx r14,r11,QWORD[8+rsi]
2432 adcx r11,rax
2433 mulx rax,r12,QWORD[16+rsi]
2434 adcx r12,r14
2435 mulx r14,r13,QWORD[24+rsi]
2436 DB 0x67,0x67
2437 mov rdx,r8
2438 adcx r13,rax
2439 adcx r14,rbp
2440 lea rsi,[32+rsi]
2441 lea rbx,[32+rbx]
2442
2443 adox r10,r15
2444 mulx r15,rax,QWORD[rcx]
2445 adcx r10,rax
2446 adox r11,r15
2447 mulx r15,rax,QWORD[8+rcx]
2448 adcx r11,rax
2449 adox r12,r15
2450 mulx r15,rax,QWORD[16+rcx]
2451 mov QWORD[((-40))+rbx],r10
2452 adcx r12,rax
2453 mov QWORD[((-32))+rbx],r11
2454 adox r13,r15
2455 mulx r15,rax,QWORD[24+rcx]
2456 mov rdx,r9
2457 mov QWORD[((-24))+rbx],r12
2458 adcx r13,rax
2459 adox r15,rbp
2460 lea rcx,[32+rcx]
2461 mov QWORD[((-16))+rbx],r13
2462
2463 dec rdi
2464 jnz NEAR $L$mulx4x_1st
2465
; rax = r9 as saved on entry; it is added to rsi here and to rbx/rcx in
; the outer loop (presumably rewinding them -- r9 is negative at the
; call sites visible in this chunk).
2466 mov rax,QWORD[8+rsp]
2467 adc r15,rbp
2468 lea rsi,[rax*1+rsi]
2469 add r14,r15
2470 mov rdi,QWORD[((8+8))+rsp]
; rbp = carry out of the top word.
2471 adc rbp,rbp
2472 mov QWORD[((-8))+rbx],r14
2473 jmp NEAR $L$mulx4x_outer
2474
2475ALIGN 32
; Outer loop: one pass per remaining table row. Gathers the next
; multiplier with the same mask-and-OR scheme (masks addressed through
; r10 = rbx - 240), then multiplies and accumulates into the result.
2476$L$mulx4x_outer:
2477 lea r10,[((16-256))+rbx]
2478 pxor xmm4,xmm4
2479 DB 0x67,0x67
2480 pxor xmm5,xmm5
2481 movdqa xmm0,XMMWORD[((-128))+rdi]
2482 movdqa xmm1,XMMWORD[((-112))+rdi]
2483 movdqa xmm2,XMMWORD[((-96))+rdi]
2484 pand xmm0,XMMWORD[256+r10]
2485 movdqa xmm3,XMMWORD[((-80))+rdi]
2486 pand xmm1,XMMWORD[272+r10]
2487 por xmm4,xmm0
2488 pand xmm2,XMMWORD[288+r10]
2489 por xmm5,xmm1
2490 pand xmm3,XMMWORD[304+r10]
2491 por xmm4,xmm2
2492 por xmm5,xmm3
2493 movdqa xmm0,XMMWORD[((-64))+rdi]
2494 movdqa xmm1,XMMWORD[((-48))+rdi]
2495 movdqa xmm2,XMMWORD[((-32))+rdi]
2496 pand xmm0,XMMWORD[320+r10]
2497 movdqa xmm3,XMMWORD[((-16))+rdi]
2498 pand xmm1,XMMWORD[336+r10]
2499 por xmm4,xmm0
2500 pand xmm2,XMMWORD[352+r10]
2501 por xmm5,xmm1
2502 pand xmm3,XMMWORD[368+r10]
2503 por xmm4,xmm2
2504 por xmm5,xmm3
2505 movdqa xmm0,XMMWORD[rdi]
2506 movdqa xmm1,XMMWORD[16+rdi]
2507 movdqa xmm2,XMMWORD[32+rdi]
2508 pand xmm0,XMMWORD[384+r10]
2509 movdqa xmm3,XMMWORD[48+rdi]
2510 pand xmm1,XMMWORD[400+r10]
2511 por xmm4,xmm0
2512 pand xmm2,XMMWORD[416+r10]
2513 por xmm5,xmm1
2514 pand xmm3,XMMWORD[432+r10]
2515 por xmm4,xmm2
2516 por xmm5,xmm3
2517 movdqa xmm0,XMMWORD[64+rdi]
2518 movdqa xmm1,XMMWORD[80+rdi]
2519 movdqa xmm2,XMMWORD[96+rdi]
2520 pand xmm0,XMMWORD[448+r10]
2521 movdqa xmm3,XMMWORD[112+rdi]
2522 pand xmm1,XMMWORD[464+r10]
2523 por xmm4,xmm0
2524 pand xmm2,XMMWORD[480+r10]
2525 por xmm5,xmm1
2526 pand xmm3,XMMWORD[496+r10]
2527 por xmm4,xmm2
2528 por xmm5,xmm3
2529 por xmm4,xmm5
2530
2531 pshufd xmm0,xmm4,0x4e
2532 por xmm0,xmm4
2533 lea rdi,[256+rdi]
; Encoded `movq rdx,xmm0`: the gathered word becomes the MULX multiplier.
2534DB 102,72,15,126,194
2535
; Store the previous pass's top carry, then step rbx by rax + 32.
2536 mov QWORD[rbx],rbp
2537 lea rbx,[32+rax*1+rbx]
2538 mulx r11,r8,QWORD[rsi]
; xor clears CF and OF for the ADCX/ADOX chains; rbp = 0.
2539 xor rbp,rbp
2540 mov r9,rdx
2541 mulx r12,r14,QWORD[8+rsi]
2542 adox r8,QWORD[((-32))+rbx]
2543 adcx r11,r14
2544 mulx r13,r15,QWORD[16+rsi]
2545 adox r11,QWORD[((-24))+rbx]
2546 adcx r12,r15
2547 mulx r14,rdx,QWORD[24+rsi]
2548 adox r12,QWORD[((-16))+rbx]
2549 adcx r13,rdx
2550 lea rcx,[rax*1+rcx]
2551 lea rsi,[32+rsi]
2552 adox r13,QWORD[((-8))+rbx]
2553 adcx r14,rbp
2554 adox r14,rbp
2555
; Reduction multiplier for this pass: r8 = low word * [32+8+rsp].
2556 mov r15,r8
2557 imul r8,QWORD[((32+8))+rsp]
2558
2559 mov rdx,r8
2560 xor rbp,rbp
2561 mov QWORD[((8+8))+rsp],rdi
2562
2563 mulx r10,rax,QWORD[rcx]
2564 adcx r15,rax
2565 adox r10,r11
2566 mulx r11,rax,QWORD[8+rcx]
2567 adcx r10,rax
2568 adox r11,r12
2569 mulx r12,rax,QWORD[16+rcx]
2570 adcx r11,rax
2571 adox r12,r13
2572 mulx r15,rax,QWORD[24+rcx]
2573 mov rdx,r9
2574 mov rdi,QWORD[((24+8))+rsp]
2575 mov QWORD[((-32))+rbx],r10
2576 adcx r12,rax
2577 mov QWORD[((-24))+rbx],r11
2578 adox r15,rbp
2579 mov QWORD[((-16))+rbx],r12
2580 lea rcx,[32+rcx]
2581 jmp NEAR $L$mulx4x_inner
2582
2583ALIGN 32
; Inner loop: like $L$mulx4x_1st, but also adds the existing result
; words at [rbx..24+rbx]. rdi iterations, 4 limbs each.
2584$L$mulx4x_inner:
2585 mulx rax,r10,QWORD[rsi]
2586 adcx r15,rbp
2587 adox r10,r14
2588 mulx r14,r11,QWORD[8+rsi]
2589 adcx r10,QWORD[rbx]
2590 adox r11,rax
2591 mulx rax,r12,QWORD[16+rsi]
2592 adcx r11,QWORD[8+rbx]
2593 adox r12,r14
2594 mulx r14,r13,QWORD[24+rsi]
2595 mov rdx,r8
2596 adcx r12,QWORD[16+rbx]
2597 adox r13,rax
2598 adcx r13,QWORD[24+rbx]
2599 adox r14,rbp
2600 lea rsi,[32+rsi]
2601 lea rbx,[32+rbx]
2602 adcx r14,rbp
2603
2604 adox r10,r15
2605 mulx r15,rax,QWORD[rcx]
2606 adcx r10,rax
2607 adox r11,r15
2608 mulx r15,rax,QWORD[8+rcx]
2609 adcx r11,rax
2610 adox r12,r15
2611 mulx r15,rax,QWORD[16+rcx]
2612 mov QWORD[((-40))+rbx],r10
2613 adcx r12,rax
2614 adox r13,r15
2615 mov QWORD[((-32))+rbx],r11
2616 mulx r15,rax,QWORD[24+rcx]
2617 mov rdx,r9
2618 lea rcx,[32+rcx]
2619 mov QWORD[((-24))+rbx],r12
2620 adcx r13,rax
2621 adox r15,rbp
2622 mov QWORD[((-16))+rbx],r13
2623
2624 dec rdi
2625 jnz NEAR $L$mulx4x_inner
2626
2627 mov rax,QWORD[((0+8))+rsp]
2628 adc r15,rbp
; rdi is 0 here, so this computes 0 - [rbx]: CF is set iff the stored
; carry word is non-zero. The rdi result is discarded by the next mov.
2629 sub rdi,QWORD[rbx]
2630 mov rdi,QWORD[((8+8))+rsp]
2631 mov r10,QWORD[((16+8))+rsp]
2632 adc r14,r15
2633 lea rsi,[rax*1+rsi]
2634 adc rbp,rbp
2635 mov QWORD[((-8))+rbx],r14
2636
; Loop while the table row pointer is (unsigned) below the end pointer.
2637 cmp rdi,r10
2638 jb NEAR $L$mulx4x_outer
2639
; Set up registers for the shared tail at $L$sqrx4x_sub_entry:
; rbp = rcx + rax, rdi = rbx + rax, rcx = rax >> 5 (arithmetic),
; rdx = rdi saved on entry, r12..r15 = first four qwords at rbp with
; r12 decremented, r8 = 0, and
; rax = -(top carry OR borrow of ([rcx-8] - r14)), i.e. 0 or all-ones.
; NOTE(review): presumably selects whether the modulus is subtracted in
;   the tail -- confirm against __bn_postx4x_internal.
2640 mov r10,QWORD[((-8))+rcx]
2641 mov r8,rbp
2642 mov r12,QWORD[rax*1+rcx]
2643 lea rbp,[rax*1+rcx]
2644 mov rcx,rax
2645 lea rdi,[rax*1+rbx]
2646 xor eax,eax
2647 xor r15,r15
2648 sub r10,r14
2649 adc r15,r15
2650 or r8,r15
2651 sar rcx,3+2
2652 sub rax,r8
2653 mov rdx,QWORD[((56+8))+rsp]
2654 dec r12
2655 mov r13,QWORD[8+rbp]
2656 xor r8,r8
2657 mov r14,QWORD[16+rbp]
2658 mov r15,QWORD[24+rbp]
2659 jmp NEAR $L$sqrx4x_sub_entry
2660
2661
David Benjamin3efe2eb2024-05-08 22:24:27 -07002662global bn_powerx5
David Benjaminfe0c91e2024-03-18 15:37:24 +10002663
2664ALIGN 32
; ---------------------------------------------------------------------
; bn_powerx5 -- exported entry point (Microsoft x64 ABI).
; The prologue saves rdi/rsi in the caller's home slots and moves the six
; arguments (rcx, rdx, r8, r9, [40+rsp], [48+rsp]) into
; rdi, rsi, rdx, rcx, r8, r9. A seventh 32-bit argument is read later by
; mulx4x_internal from [56+rax], where rax is the entry rsp.
; Carves a 64-byte-aligned scratch frame (same scheme as
; bn_mulx4x_mont_gather5), then runs five rounds of
; __bn_sqrx8x_internal + __bn_postx4x_internal followed by one
; mulx4x_internal, restores the callee-saved registers and returns
; rax = 1.
; NOTE(review): five squarings then one multiply -- presumably
;   r = a^(2^5) * table[power] mod n; confirm against the C prototype.
; ---------------------------------------------------------------------
2665bn_powerx5:
2666 mov QWORD[8+rsp],rdi ;WIN64 prologue
2667 mov QWORD[16+rsp],rsi
2668 mov rax,rsp
2669$L$SEH_begin_bn_powerx5:
2670 mov rdi,rcx
2671 mov rsi,rdx
2672 mov rdx,r8
2673 mov rcx,r9
2674 mov r8,QWORD[40+rsp]
2675 mov r9,QWORD[48+rsp]
2676
2677
2678
David Benjamin3efe2eb2024-05-08 22:24:27 -07002679_CET_ENDBR
; rax keeps the entry stack pointer for the epilogue.
David Benjaminfe0c91e2024-03-18 15:37:24 +10002680 mov rax,rsp
2681
David Benjaminfe0c91e2024-03-18 15:37:24 +10002682 push rbx
2683
2684 push rbp
2685
2686 push r12
2687
2688 push r13
2689
2690 push r14
2691
2692 push r15
2693
2694$L$powerx5_prologue:
2695
David Benjamin3efe2eb2024-05-08 22:24:27 -07002696
2697
2698
; r9 = 8*r9 (32-bit shift zero-extends), r10 = 3*r9, then r9 = -r9;
; r8 = first qword at the pointer passed in r8.
David Benjaminfe0c91e2024-03-18 15:37:24 +10002699 shl r9d,3
2700 lea r10,[r9*2+r9]
2701 neg r9
2702 mov r8,QWORD[r8]
2703
2704
2705
2706
2707
2708
2709
2710
; Choose the frame base rbp from the distance, modulo 4096, between the
; tentative frame (rsp - 320 - 2*|r9|) and rdi.
2711 lea r11,[((-320))+r9*2+rsp]
2712 mov rbp,rsp
2713 sub r11,rdi
2714 and r11,4095
2715 cmp r10,r11
2716 jb NEAR $L$pwrx_sp_alt
2717 sub rbp,r11
2718 lea rbp,[((-320))+r9*2+rbp]
2719 jmp NEAR $L$pwrx_sp_done
2720
2721ALIGN 32
2722$L$pwrx_sp_alt:
2723 lea r10,[((4096-320))+r9*2]
2724 lea rbp,[((-320))+r9*2+rbp]
2725 sub r11,r10
; If the subtraction borrowed, use an adjustment of 0.
2726 mov r10,0
2727 cmovc r11,r10
2728 sub rbp,r11
2729$L$pwrx_sp_done:
; Align the frame to 64 bytes, then lower rsp to it in 4096-byte steps,
; reading one qword per step so every page in between is touched.
2730 and rbp,-64
2731 mov r11,rsp
2732 sub r11,rbp
2733 and r11,-4096
2734 lea rsp,[rbp*1+r11]
2735 mov r10,QWORD[rsp]
2736 cmp rsp,rbp
2737 ja NEAR $L$pwrx_page_walk
2738 jmp NEAR $L$pwrx_page_walk_done
2739
2740$L$pwrx_page_walk:
2741 lea rsp,[((-4096))+rsp]
2742 mov r10,QWORD[rsp]
2743 cmp rsp,rbp
2744 ja NEAR $L$pwrx_page_walk
2745$L$pwrx_page_walk_done:
2746
; r10 = negative byte count, r9 = positive byte count.
2747 mov r10,r9
2748 neg r9
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
; Stash values in XMM registers across the internal calls:
; xmm0 = 0, xmm1 = rdi, xmm2 = rcx, xmm3 = r10, xmm4 = rdx
; (the four DB lines encode `movq xmmN,reg`).
2761 pxor xmm0,xmm0
2762DB 102,72,15,110,207
2763DB 102,72,15,110,209
2764DB 102,73,15,110,218
2765DB 102,72,15,110,226
; Frame slots: [32+rsp] = value loaded from *r8, [40+rsp] = entry rsp.
2766 mov QWORD[32+rsp],r8
2767 mov QWORD[40+rsp],rax
2768
2769$L$powerx5_body:
2770
; Five square-then-post-process rounds.
2771 call __bn_sqrx8x_internal
2772 call __bn_postx4x_internal
2773 call __bn_sqrx8x_internal
2774 call __bn_postx4x_internal
2775 call __bn_sqrx8x_internal
2776 call __bn_postx4x_internal
2777 call __bn_sqrx8x_internal
2778 call __bn_postx4x_internal
2779 call __bn_sqrx8x_internal
2780 call __bn_postx4x_internal
2781
; Set up mulx4x_internal's inputs: r9 = r10, rdi = rsi, rcx from xmm2,
; rdx from xmm4 (DB lines encode `movq rcx,xmm2` / `movq rdx,xmm4`),
; rax = entry rsp.
2782 mov r9,r10
2783 mov rdi,rsi
2784DB 102,72,15,126,209
2785DB 102,72,15,126,226
2786 mov rax,QWORD[40+rsp]
2787
2788 call mulx4x_internal
2789
; Epilogue: rsi = entry rsp; the six pushed registers sit just below it.
2790 mov rsi,QWORD[40+rsp]
2791
2792 mov rax,1
2793
2794 mov r15,QWORD[((-48))+rsi]
2795
2796 mov r14,QWORD[((-40))+rsi]
2797
2798 mov r13,QWORD[((-32))+rsi]
2799
2800 mov r12,QWORD[((-24))+rsi]
2801
2802 mov rbp,QWORD[((-16))+rsi]
2803
2804 mov rbx,QWORD[((-8))+rsi]
2805
2806 lea rsp,[rsi]
2807
2808$L$powerx5_epilogue:
2809 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
2810 mov rsi,QWORD[16+rsp]
2811 ret
2812
2813$L$SEH_end_bn_powerx5:
2814
2815global bn_sqrx8x_internal
2816
2817
2818ALIGN 32
; ---------------------------------------------------------------------
; bn_sqrx8x_internal / __bn_sqrx8x_internal -- internal squaring routine
; (two labels for the same entry; the first is declared global).
; There is no `ret` here: execution falls through into the
; __bn_sqrx8x_reduction label that follows this block.
; Reads on entry: rsi (input limbs), r9 (byte count, counted down in
;   steps of 64), xmm0 (stored as the zero fill), xmm2, xmm3, and the
;   frame at rsp (working vector starts at [48+8+rsp]).
; Frame slots written: [0+8+rsp] = r9, [8+8+rsp] = rsi + r9 (end of
;   input), [16+8+rsp] = saved carry mask, [24+8+rsp] = saved rdi.
; Uses MULX/ADCX/ADOX, so it requires BMI2 and ADX. The multi-byte
; `DB 0xc4,...` lines are hand-encoded MULX instructions with a 32-bit
; displacement; each is decoded in the comment above it.
; NOTE(review): presumably computes the off-diagonal products
;   a[i]*a[j] (i<j) first, then doubles them and adds the squares
;   a[i]^2 in $L$sqrx4x_shift_n_add -- confirm against the Perl source.
; ---------------------------------------------------------------------
2819bn_sqrx8x_internal:
2820__bn_sqrx8x_internal:
2821
2822_CET_ENDBR
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863 lea rdi,[((48+8))+rsp]
2864 lea rbp,[r9*1+rsi]
2865 mov QWORD[((0+8))+rsp],r9
2866 mov QWORD[((8+8))+rsp],rbp
2867 jmp NEAR $L$sqr8x_zero_start
2868
2869ALIGN 32
; 12-byte NOP (0F 1F /0 with prefixes) used as padding.
2870 DB 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
; Fill loop: stores xmm0, 128 bytes per full iteration, r9 -= 64 each
; time. The loop is entered at $L$sqr8x_zero_start, so the first pass
; writes only [64+rdi .. 127+rdi].
2871$L$sqrx8x_zero:
; 0x3e is a prefix byte prepended to the following movdqa (presumably padding).
2872 DB 0x3e
2873 movdqa XMMWORD[rdi],xmm0
2874 movdqa XMMWORD[16+rdi],xmm0
2875 movdqa XMMWORD[32+rdi],xmm0
2876 movdqa XMMWORD[48+rdi],xmm0
2877$L$sqr8x_zero_start:
2878 movdqa XMMWORD[64+rdi],xmm0
2879 movdqa XMMWORD[80+rdi],xmm0
2880 movdqa XMMWORD[96+rdi],xmm0
2881 movdqa XMMWORD[112+rdi],xmm0
2882 lea rdi,[128+rdi]
2883 sub r9,64
2884 jnz NEAR $L$sqrx8x_zero
2885
; rdx = first input limb (the implicit MULX multiplier).
2886 mov rdx,QWORD[rsi]
2887
2888 xor r10,r10
2889 xor r11,r11
2890 xor r12,r12
2891 xor r13,r13
2892 xor r14,r14
2893 xor r15,r15
2894 lea rdi,[((48+8))+rsp]
; xor also clears CF and OF for the ADCX/ADOX chains; rbp = 0.
2895 xor rbp,rbp
2896 jmp NEAR $L$sqrx8x_outer_loop
2897
2898ALIGN 32
; Outer loop: cross products inside the current 8-limb window at rsi.
; Row 0: limb 0 times limbs 1..7.
2899$L$sqrx8x_outer_loop:
2900 mulx rax,r8,QWORD[8+rsi]
2901 adcx r8,r9
2902 adox r10,rax
2903 mulx rax,r9,QWORD[16+rsi]
2904 adcx r9,r10
2905 adox r11,rax
; mulx rax,r10,QWORD[24+rsi]
2906 DB 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
2907 adcx r10,r11
2908 adox r12,rax
; mulx rax,r11,QWORD[32+rsi]
2909 DB 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
2910 adcx r11,r12
2911 adox r13,rax
2912 mulx rax,r12,QWORD[40+rsi]
2913 adcx r12,r13
2914 adox r14,rax
2915 mulx rax,r13,QWORD[48+rsi]
2916 adcx r13,r14
2917 adox rax,r15
2918 mulx r15,r14,QWORD[56+rsi]
2919 mov rdx,QWORD[8+rsi]
2920 adcx r14,rax
2921 adox r15,rbp
2922 adc r15,QWORD[64+rdi]
2923 mov QWORD[8+rdi],r8
2924 mov QWORD[16+rdi],r9
; Save the carry as a 0 / all-ones mask in rcx; xor then clears CF/OF.
2925 sbb rcx,rcx
2926 xor rbp,rbp
2927
2928
; Row 1: limb 1 times limbs 2..7.
2929 mulx rbx,r8,QWORD[16+rsi]
2930 mulx rax,r9,QWORD[24+rsi]
2931 adcx r8,r10
2932 adox r9,rbx
2933 mulx rbx,r10,QWORD[32+rsi]
2934 adcx r9,r11
2935 adox r10,rax
; mulx rax,r11,QWORD[40+rsi]
2936 DB 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
2937 adcx r10,r12
2938 adox r11,rbx
; mulx rbx,r12,QWORD[48+rsi]
2939 DB 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
2940 adcx r11,r13
2941 adox r12,r14
; mulx r14,r13,QWORD[56+rsi]
2942 DB 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
2943 mov rdx,QWORD[16+rsi]
2944 adcx r12,rax
2945 adox r13,rbx
2946 adcx r13,r15
2947 adox r14,rbp
2948 adcx r14,rbp
2949
2950 mov QWORD[24+rdi],r8
2951 mov QWORD[32+rdi],r9
2952
; Row 2: limb 2 times limbs 3..7.
2953 mulx rbx,r8,QWORD[24+rsi]
2954 mulx rax,r9,QWORD[32+rsi]
2955 adcx r8,r10
2956 adox r9,rbx
2957 mulx rbx,r10,QWORD[40+rsi]
2958 adcx r9,r11
2959 adox r10,rax
; mulx rax,r11,QWORD[48+rsi]
2960 DB 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
2961 adcx r10,r12
2962 adox r11,r13
; mulx r13,r12,QWORD[56+rsi]
2963 DB 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
; 0x3e is a prefix byte prepended to the following mov (presumably padding).
2964 DB 0x3e
2965 mov rdx,QWORD[24+rsi]
2966 adcx r11,rbx
2967 adox r12,rax
2968 adcx r12,r14
2969 mov QWORD[40+rdi],r8
2970 mov QWORD[48+rdi],r9
; Row 3: limb 3 times limbs 4..7.
2971 mulx rax,r8,QWORD[32+rsi]
2972 adox r13,rbp
2973 adcx r13,rbp
2974
2975 mulx rbx,r9,QWORD[40+rsi]
2976 adcx r8,r10
2977 adox r9,rax
2978 mulx rax,r10,QWORD[48+rsi]
2979 adcx r9,r11
2980 adox r10,r12
2981 mulx r12,r11,QWORD[56+rsi]
2982 mov rdx,QWORD[32+rsi]
2983 mov r14,QWORD[40+rsi]
2984 adcx r10,rbx
2985 adox r11,rax
2986 mov r15,QWORD[48+rsi]
2987 adcx r11,r13
2988 adox r12,rbp
2989 adcx r12,rbp
2990
2991 mov QWORD[56+rdi],r8
2992 mov QWORD[64+rdi],r9
2993
; Rows 4..6: limbs 5, 6, 7 are held in r14, r15, r8.
2994 mulx rax,r9,r14
2995 mov r8,QWORD[56+rsi]
2996 adcx r9,r10
2997 mulx rbx,r10,r15
2998 adox r10,rax
2999 adcx r10,r11
3000 mulx rax,r11,r8
3001 mov rdx,r14
3002 adox r11,rbx
3003 adcx r11,r12
3004
3005 adcx rax,rbp
3006
3007 mulx rbx,r14,r15
3008 mulx r13,r12,r8
3009 mov rdx,r15
3010 lea rsi,[64+rsi]
3011 adcx r11,r14
3012 adox r12,rbx
3013 adcx r12,rax
3014 adox r13,rbp
3015
3016 DB 0x67,0x67
3017 mulx r14,r8,r8
3018 adcx r13,r8
3019 adcx r14,rbp
3020
; rsi has advanced by 8 limbs; stop when it reaches the end of input.
3021 cmp rsi,QWORD[((8+8))+rsp]
3022 je NEAR $L$sqrx8x_outer_break
3023
; `neg` restores CF from the mask saved in rcx (mov does not alter
; flags); rcx then becomes the inner index -8..-1.
3024 neg rcx
3025 mov rcx,-8
3026 mov r15,rbp
3027 mov r8,QWORD[64+rdi]
3028 adcx r9,QWORD[72+rdi]
3029 adcx r10,QWORD[80+rdi]
3030 adcx r11,QWORD[88+rdi]
3031 adc r12,QWORD[96+rdi]
3032 adc r13,QWORD[104+rdi]
3033 adc r14,QWORD[112+rdi]
3034 adc r15,QWORD[120+rdi]
3035 lea rbp,[rsi]
3036 lea rdi,[128+rdi]
3037 sbb rax,rax
3038
3039 mov rdx,QWORD[((-64))+rsi]
3040 mov QWORD[((16+8))+rsp],rax
3041 mov QWORD[((24+8))+rsp],rdi
3042
3043
; Clears CF and OF before entering the loop.
3044 xor eax,eax
3045 jmp NEAR $L$sqrx8x_loop
3046
3047ALIGN 32
; Inner loop, rcx = -8..-1: multiply one limb of the previous window
; (rdx, reloaded from [8+rcx*8+rsi]) by the 8 limbs at rbp, emitting one
; finished word to [rcx*8+rdi] per iteration.
3048$L$sqrx8x_loop:
3049 mov rbx,r8
3050 mulx r8,rax,QWORD[rbp]
3051 adcx rbx,rax
3052 adox r8,r9
3053
3054 mulx r9,rax,QWORD[8+rbp]
3055 adcx r8,rax
3056 adox r9,r10
3057
3058 mulx r10,rax,QWORD[16+rbp]
3059 adcx r9,rax
3060 adox r10,r11
3061
3062 mulx r11,rax,QWORD[24+rbp]
3063 adcx r10,rax
3064 adox r11,r12
3065
; mulx r12,rax,QWORD[32+rbp]
3066 DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3067 adcx r11,rax
3068 adox r12,r13
3069
3070 mulx r13,rax,QWORD[40+rbp]
3071 adcx r12,rax
3072 adox r13,r14
3073
3074 mulx r14,rax,QWORD[48+rbp]
3075 mov QWORD[rcx*8+rdi],rbx
; rbx = 0 without touching flags (mov, not xor).
3076 mov ebx,0
3077 adcx r13,rax
3078 adox r14,r15
3079
; mulx r15,rax,QWORD[56+rbp]
3080 DB 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
3081 mov rdx,QWORD[8+rcx*8+rsi]
3082 adcx r14,rax
3083 adox r15,rbx
3084 adcx r15,rbx
3085
3086 DB 0x67
3087 inc rcx
3088 jnz NEAR $L$sqrx8x_loop
3089
3090 lea rbp,[64+rbp]
3091 mov rcx,-8
3092 cmp rbp,QWORD[((8+8))+rsp]
3093 je NEAR $L$sqrx8x_break
3094
; rbx is 0, so 0 - mask sets CF iff the saved carry mask is non-zero.
3095 sub rbx,QWORD[((16+8))+rsp]
3096 DB 0x66
3097 mov rdx,QWORD[((-64))+rsi]
3098 adcx r8,QWORD[rdi]
3099 adcx r9,QWORD[8+rdi]
3100 adc r10,QWORD[16+rdi]
3101 adc r11,QWORD[24+rdi]
3102 adc r12,QWORD[32+rdi]
3103 adc r13,QWORD[40+rdi]
3104 adc r14,QWORD[48+rdi]
3105 adc r15,QWORD[56+rdi]
3106 lea rdi,[64+rdi]
3107 DB 0x67
3108 sbb rax,rax
3109 xor ebx,ebx
3110 mov QWORD[((16+8))+rsp],rax
3111 jmp NEAR $L$sqrx8x_loop
3112
3113ALIGN 32
; End of a column sweep: fold in the saved carry, store the low word,
; then either continue directly (rdi == saved rdi) or spill r9..r15 and
; reload them from the saved position before the next outer pass.
3114$L$sqrx8x_break:
3115 xor rbp,rbp
3116 sub rbx,QWORD[((16+8))+rsp]
3117 adcx r8,rbp
3118 mov rcx,QWORD[((24+8))+rsp]
3119 adcx r9,rbp
3120 mov rdx,QWORD[rsi]
3121 adc r10,0
3122 mov QWORD[rdi],r8
3123 adc r11,0
3124 adc r12,0
3125 adc r13,0
3126 adc r14,0
3127 adc r15,0
3128 cmp rdi,rcx
3129 je NEAR $L$sqrx8x_outer_loop
3130
3131 mov QWORD[8+rdi],r9
3132 mov r9,QWORD[8+rcx]
3133 mov QWORD[16+rdi],r10
3134 mov r10,QWORD[16+rcx]
3135 mov QWORD[24+rdi],r11
3136 mov r11,QWORD[24+rcx]
3137 mov QWORD[32+rdi],r12
3138 mov r12,QWORD[32+rcx]
3139 mov QWORD[40+rdi],r13
3140 mov r13,QWORD[40+rcx]
3141 mov QWORD[48+rdi],r14
3142 mov r14,QWORD[48+rcx]
3143 mov QWORD[56+rdi],r15
3144 mov r15,QWORD[56+rcx]
3145 mov rdi,rcx
3146 jmp NEAR $L$sqrx8x_outer_loop
3147
3148ALIGN 32
; All windows done: store the last partial words, then reload
; rcx from xmm3 (DB encodes `movq rcx,xmm3`) and r9 from [0+8+rsp].
3149$L$sqrx8x_outer_break:
3150 mov QWORD[72+rdi],r9
3151DB 102,72,15,126,217
3152 mov QWORD[80+rdi],r10
3153 mov QWORD[88+rdi],r11
3154 mov QWORD[96+rdi],r12
3155 mov QWORD[104+rdi],r13
3156 mov QWORD[112+rdi],r14
3157 lea rdi,[((48+8))+rsp]
3158 mov rdx,QWORD[rcx*1+rsi]
3159
3160 mov r11,QWORD[8+rdi]
; xor clears CF and OF for the ADCX/ADOX chains; r10 = 0.
3161 xor r10,r10
3162 mov r9,QWORD[((0+8))+rsp]
3163 adox r11,r11
3164 mov r12,QWORD[16+rdi]
3165 mov r13,QWORD[24+rdi]
3166
3167
3168ALIGN 32
; Shift-and-add loop: each working-vector word is doubled through the
; OF chain (adox r,r) and the square rdx*rdx of the next input limb is
; added through the CF chain (adcx). Four limbs (8 output words) per
; iteration; rcx advances by 32 and the loop exits via jrcxz when it
; reaches 0.
3169$L$sqrx4x_shift_n_add:
3170 mulx rbx,rax,rdx
3171 adox r12,r12
3172 adcx rax,r10
; mov rdx,QWORD[8+rcx*1+rsi] (hand-encoded with a 32-bit displacement)
3173 DB 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
; mov r10,QWORD[32+rdi] (hand-encoded with a 32-bit displacement)
3174 DB 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
3175 adox r13,r13
3176 adcx rbx,r11
3177 mov r11,QWORD[40+rdi]
3178 mov QWORD[rdi],rax
3179 mov QWORD[8+rdi],rbx
3180
3181 mulx rbx,rax,rdx
3182 adox r10,r10
3183 adcx rax,r12
3184 mov rdx,QWORD[16+rcx*1+rsi]
3185 mov r12,QWORD[48+rdi]
3186 adox r11,r11
3187 adcx rbx,r13
3188 mov r13,QWORD[56+rdi]
3189 mov QWORD[16+rdi],rax
3190 mov QWORD[24+rdi],rbx
3191
3192 mulx rbx,rax,rdx
3193 adox r12,r12
3194 adcx rax,r10
3195 mov rdx,QWORD[24+rcx*1+rsi]
3196 lea rcx,[32+rcx]
3197 mov r10,QWORD[64+rdi]
3198 adox r13,r13
3199 adcx rbx,r11
3200 mov r11,QWORD[72+rdi]
3201 mov QWORD[32+rdi],rax
3202 mov QWORD[40+rdi],rbx
3203
3204 mulx rbx,rax,rdx
3205 adox r10,r10
3206 adcx rax,r12
; jrcxz tests rcx without reading or changing the flags.
3207 jrcxz $L$sqrx4x_shift_n_add_break
; mov rdx,QWORD[0+rcx*1+rsi] (hand-encoded with a 32-bit displacement)
3208 DB 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
3209 adox r11,r11
3210 adcx rbx,r13
3211 mov r12,QWORD[80+rdi]
3212 mov r13,QWORD[88+rdi]
3213 mov QWORD[48+rdi],rax
3214 mov QWORD[56+rdi],rbx
3215 lea rdi,[64+rdi]
3216 nop
3217 jmp NEAR $L$sqrx4x_shift_n_add
3218
3219ALIGN 32
3220$L$sqrx4x_shift_n_add_break:
3221 adcx rbx,r13
3222 mov QWORD[48+rdi],rax
3223 mov QWORD[56+rdi],rbx
3224 lea rdi,[64+rdi]
; Encoded `movq rbp,xmm2`; execution then falls through to the next label.
3225DB 102,72,15,126,213
; __bn_sqrx8x_reduction
;
; Reached by fall-through from the code directly above this label and left
; through the `ret` at the bottom; whether it is also called by name is not
; visible in this part of the file.
; NOTE(review): going by the label name and the banner string in .rdata this is
; presumably the Montgomery reduction pass of the 8x squaring path. The roles
; listed below are read off the instructions only -- confirm against the
; generating Perl script.
;
; Registers consumed:
;   rbp       base of the 8-word chunks that feed every mulx
;   r9        byte length; rbp + r9 - 64 is taken as the last chunk
;   rdi       saved as the end marker of the outer loop
;   xmm2/xmm3 copied back into rbp / rcx near the bottom of the outer loop
;             (their contents are set up outside this view)
; Stack slots, written as they appear in the code:
;   (0+8)+rsp   rbp + r9 - 64, address of the last chunk
;   (8+8)+rsp   incoming rdi, outer loop end marker
;   (16+8)+rsp  carry out of a tail pass, kept as 0 / -1
;   (24+8)+rsp  carry (rax) handed from one outer iteration to the next
;   (32+8)+rsp  constant multiplier loaded into rbx (presumably n0)
;   (48+8)+rsp  start of the word buffer processed in 64-byte windows; the
;               eight multipliers of the current window are parked here too
3226__bn_sqrx8x_reduction:
3227 xor eax,eax ; rax = 0: no carry pending before the first window
3228 mov rbx,QWORD[((32+8))+rsp] ; rbx = constant multiplier
3229 mov rdx,QWORD[((48+8))+rsp] ; rdx = word 0 of the first window
3230 lea rcx,[((-64))+r9*1+rbp] ; rcx = rbp + r9 - 64
3231
3232 mov QWORD[((0+8))+rsp],rcx ; remember the address of the last chunk
3233 mov QWORD[((8+8))+rsp],rdi ; remember incoming rdi as the outer-loop end marker
3234
3235 lea rdi,[((48+8))+rsp] ; rdi = first 8-word window
3236 jmp NEAR $L$sqrx8x_reduction_loop
3237
3238ALIGN 32
; Outer loop: one pass per 8-word window at rdi. rdx already holds word 0.
3239$L$sqrx8x_reduction_loop:
3240 mov r9,QWORD[8+rdi] ; r9..r15 = words 1..7 of the window
3241 mov r10,QWORD[16+rdi]
3242 mov r11,QWORD[24+rdi]
3243 mov r12,QWORD[32+rdi]
3244 mov r8,rdx ; r8 = word 0
3245 imul rdx,rbx ; rdx = low 64 bits of word0 * rbx: first multiplier
3246 mov r13,QWORD[40+rdi]
3247 mov r14,QWORD[48+rdi]
3248 mov r15,QWORD[56+rdi]
3249 mov QWORD[((24+8))+rsp],rax ; park the carry left by the previous window
3250
3251 lea rdi,[64+rdi] ; step to the next window (lea leaves flags alone)
3252 xor rsi,rsi ; rsi = 0; also clears CF and OF for the two carry chains
3253 mov rcx,-8 ; eight inner iterations, counted -8 .. -1
3254 jmp NEAR $L$sqrx8x_reduce
3255
3256ALIGN 32
; Inner loop: r8..r15 += rdx * (8 words at rbp), shifted down by one word per
; iteration. adcx carries through CF only, adox through OF only, so the two
; addition chains run interleaved without disturbing each other.
3257$L$sqrx8x_reduce:
3258 mov rbx,r8 ; keep the old lowest word
3259 mulx r8,rax,QWORD[rbp] ; r8:rax = rdx * word 0 at rbp (high:low)
3260 adcx rax,rbx ; the sum is never read again; only its carry (CF) is used
3261 adox r8,r9
3262
3263 mulx r9,rbx,QWORD[8+rbp]
3264 adcx r8,rbx
3265 adox r9,r10
3266
3267 mulx r10,rbx,QWORD[16+rbp]
3268 adcx r9,rbx
3269 adox r10,r11
3270
3271 mulx r11,rbx,QWORD[24+rbp]
3272 adcx r10,rbx
3273 adox r11,r12
3274
3275 DB 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 ; hand-encoded: mulx r12,rbx,QWORD[32+rbp] (disp32 form)
3276 mov rax,rdx ; set the current multiplier aside
3277 mov rdx,r8 ; rdx = new lowest word
3278 adcx r11,rbx
3279 adox r12,r13
3280
3281 mulx rdx,rbx,QWORD[((32+8))+rsp] ; rbx = low 64 bits of r8 * constant = next multiplier; high half is overwritten below
3282 mov rdx,rax ; back to the current multiplier
3283 mov QWORD[((64+48+8))+rcx*8+rsp],rax ; record it in slot rcx+8 for the tail passes
3284
3285 mulx r13,rax,QWORD[40+rbp]
3286 adcx r12,rax
3287 adox r13,r14
3288
3289 mulx r14,rax,QWORD[48+rbp]
3290 adcx r13,rax
3291 adox r14,r15
3292
3293 mulx r15,rax,QWORD[56+rbp]
3294 mov rdx,rbx ; switch to the next multiplier
3295 adcx r14,rax
3296 adox r15,rsi ; rsi is 0: fold the OF chain into the top word
3297 adcx r15,rsi ; rsi is 0: fold the CF chain into the top word
3298
3299 DB 0x67,0x67,0x67 ; three address-size prefix bytes in front of the inc; presumably padding
3300 inc rcx ; inc leaves CF untouched; counting -8..0 never overflows, so OF ends up clear
3301 jnz NEAR $L$sqrx8x_reduce
3302
3303 mov rax,rsi ; rax = 0
3304 cmp rbp,QWORD[((0+8))+rsp] ; already on the last chunk at rbp?
3305 jae NEAR $L$sqrx8x_no_tail ; taken only with CF = 0
3306
3307 mov rdx,QWORD[((48+8))+rsp] ; reload the first recorded multiplier
3308 add r8,QWORD[rdi] ; add the next window into the eight accumulators
3309 lea rbp,[64+rbp] ; next chunk at rbp
3310 mov rcx,-8
3311 adcx r9,QWORD[8+rdi]
3312 adcx r10,QWORD[16+rdi]
3313 adc r11,QWORD[24+rdi]
3314 adc r12,QWORD[32+rdi]
3315 adc r13,QWORD[40+rdi]
3316 adc r14,QWORD[48+rdi]
3317 adc r15,QWORD[56+rdi]
3318 lea rdi,[64+rdi]
3319 sbb rax,rax ; rax = 0 or -1, the carry out of the additions above
3320
3321 xor rsi,rsi ; rsi = 0; CF and OF cleared for the tail loop
3322 mov QWORD[((16+8))+rsp],rax ; park that carry
3323 jmp NEAR $L$sqrx8x_tail
3324
3325ALIGN 32
; Tail loop: apply the eight recorded multipliers to the next chunk at rbp.
; Unlike the loop above, the completed low word is stored on every iteration.
3326$L$sqrx8x_tail:
3327 mov rbx,r8
3328 mulx r8,rax,QWORD[rbp]
3329 adcx rbx,rax ; rbx = finished low word
3330 adox r8,r9
3331
3332 mulx r9,rax,QWORD[8+rbp]
3333 adcx r8,rax
3334 adox r9,r10
3335
3336 mulx r10,rax,QWORD[16+rbp]
3337 adcx r9,rax
3338 adox r10,r11
3339
3340 mulx r11,rax,QWORD[24+rbp]
3341 adcx r10,rax
3342 adox r11,r12
3343
3344 DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 ; hand-encoded: mulx r12,rax,QWORD[32+rbp] (disp32 form)
3345 adcx r11,rax
3346 adox r12,r13
3347
3348 mulx r13,rax,QWORD[40+rbp]
3349 adcx r12,rax
3350 adox r13,r14
3351
3352 mulx r14,rax,QWORD[48+rbp]
3353 adcx r13,rax
3354 adox r14,r15
3355
3356 mulx r15,rax,QWORD[56+rbp]
3357 mov rdx,QWORD[((72+48+8))+rcx*8+rsp] ; rdx = multiplier recorded in slot rcx+9, used by the next iteration
3358 adcx r14,rax
3359 adox r15,rsi ; rsi is 0: fold the OF chain
3360 mov QWORD[rcx*8+rdi],rbx ; store the finished word at rdi + 8*rcx (rcx is negative)
3361 mov rbx,r8
3362 adcx r15,rsi ; rsi is 0: fold the CF chain
3363
3364 inc rcx ; CF untouched
3365 jnz NEAR $L$sqrx8x_tail
3366
3367 cmp rbp,QWORD[((0+8))+rsp] ; was that the last chunk at rbp?
3368 jae NEAR $L$sqrx8x_tail_done
3369
3370 sub rsi,QWORD[((16+8))+rsp] ; rsi is 0 here, so 0 - (0 or -1) puts the parked carry back in CF
3371 mov rdx,QWORD[((48+8))+rsp] ; first recorded multiplier again
3372 lea rbp,[64+rbp]
3373 adc r8,QWORD[rdi] ; accumulators += next window + parked carry
3374 adc r9,QWORD[8+rdi]
3375 adc r10,QWORD[16+rdi]
3376 adc r11,QWORD[24+rdi]
3377 adc r12,QWORD[32+rdi]
3378 adc r13,QWORD[40+rdi]
3379 adc r14,QWORD[48+rdi]
3380 adc r15,QWORD[56+rdi]
3381 lea rdi,[64+rdi]
3382 sbb rax,rax ; new carry as 0 / -1
3383 sub rcx,8 ; rcx is 0 after the loop, so this makes it -8
3384
3385 xor rsi,rsi ; rsi = 0; CF and OF cleared
3386 mov QWORD[((16+8))+rsp],rax ; park the new carry
3387 jmp NEAR $L$sqrx8x_tail
3388
3389ALIGN 32
3390$L$sqrx8x_tail_done:
3391 xor rax,rax
3392 add r8,QWORD[((24+8))+rsp] ; add the carry parked at the top of the outer loop
3393 adc r9,0
3394 adc r10,0
3395 adc r11,0
3396 adc r12,0
3397 adc r13,0
3398 adc r14,0
3399 adc r15,0
3400 adc rax,0 ; rax = carry out of that ripple
3401
3402 sub rsi,QWORD[((16+8))+rsp] ; rsi is 0 here: CF = carry parked by the last tail pass
; Also entered straight from the inner loop; that jae is only taken with CF = 0.
3403$L$sqrx8x_no_tail:
3404 adc r8,QWORD[rdi] ; accumulators += topmost window (+ CF)
3405DB 102,72,15,126,217 ; hand-encoded: movq rcx,xmm3
3406 adc r9,QWORD[8+rdi]
3407 mov rsi,QWORD[56+rbp] ; rsi = last word of the current chunk; re-zeroed if the outer loop repeats
3408DB 102,72,15,126,213 ; hand-encoded: movq rbp,xmm2
3409 adc r10,QWORD[16+rdi]
3410 adc r11,QWORD[24+rdi]
3411 adc r12,QWORD[32+rdi]
3412 adc r13,QWORD[40+rdi]
3413 adc r14,QWORD[48+rdi]
3414 adc r15,QWORD[56+rdi]
3415 adc rax,0 ; rax = carry for the next outer iteration
3416
3417 mov rbx,QWORD[((32+8))+rsp] ; reload the constant multiplier
3418 mov rdx,QWORD[64+rcx*1+rdi] ; preload word 0 of the next window
3419
3420 mov QWORD[rdi],r8 ; write the eight accumulators back to the window at rdi
3421 lea r8,[64+rdi] ; r8 = end of the stored window, compared below
3422 mov QWORD[8+rdi],r9
3423 mov QWORD[16+rdi],r10
3424 mov QWORD[24+rdi],r11
3425 mov QWORD[32+rdi],r12
3426 mov QWORD[40+rdi],r13
3427 mov QWORD[48+rdi],r14
3428 mov QWORD[56+rdi],r15
3429
3430 lea rdi,[64+rcx*1+rdi] ; rdi = next window (rcx came from xmm3)
3431 cmp r8,QWORD[((8+8))+rsp] ; reached the saved end marker?
3432 jb NEAR $L$sqrx8x_reduction_loop
3433 ret
3434
3435
3436ALIGN 32
3437
; __bn_postx4x_internal
;
; Walks three word arrays four words at a time and stores
;     out[i] = t[i] + (~n[i] & mask) + carry
; where t is read through rdi, n through rbp, out is written through rdx and
; mask = -rax. Word 0 of n is decremented before it is complemented, which
; turns it into -n[0]; provided that word is non-zero, the masked operand is the
; two's complement of n, so an all-ones mask yields t - n and a zero mask
; yields a plain copy of t.
; NOTE(review): assumes rax is 0 or 1 on entry (mask 0 or all ones) and that
; rcx is a negative byte count -- both are set up outside this view; confirm.
;
; In:  rbp = n, rdi = t, rax = selector, rcx = negative byte count,
;      xmm1 = destination pointer
; Out: r9 = -(incoming rcx), r10 = incoming rcx, rsi = xmm1 (start of output);
;      rbp, rdi, rdx advanced past the processed words; rcx = 0
3438__bn_postx4x_internal:
3439
3440 mov r12,QWORD[rbp] ; word 0 of n
3441 mov r10,rcx ; copy of the count, not used again in this routine
3442 mov r9,rcx ; copy of the count, negated before returning
3443 neg rax ; selector -> mask
3444 sar rcx,3+2 ; bytes -> groups of four 8-byte words (still negative)
3445
3446DB 102,72,15,126,202 ; hand-encoded: movq rdx,xmm1 (output pointer)
3447DB 102,72,15,126,206 ; hand-encoded: movq rsi,xmm1
3448 dec r12 ; so that the complement below gives -n[0]
3449 mov r13,QWORD[8+rbp]
3450 xor r8,r8 ; r8 = 0: no carry saved yet
3451 mov r14,QWORD[16+rbp]
3452 mov r15,QWORD[24+rbp]
3453 jmp NEAR $L$sqrx4x_sub_entry
3454
3455ALIGN 16
3456$L$sqrx4x_sub:
3457 mov r12,QWORD[rbp] ; next four words of n
3458 mov r13,QWORD[8+rbp]
3459 mov r14,QWORD[16+rbp]
3460 mov r15,QWORD[24+rbp]
3461$L$sqrx4x_sub_entry:
3462 andn r12,r12,rax ; r12 = ~r12 & mask
3463 lea rbp,[32+rbp]
3464 andn r13,r13,rax
3465 andn r14,r14,rax
3466 andn r15,r15,rax
3467
3468 neg r8 ; CF = (r8 != 0): bring the saved carry back
3469 adc r12,QWORD[rdi]
3470 adc r13,QWORD[8+rdi]
3471 adc r14,QWORD[16+rdi]
3472 adc r15,QWORD[24+rdi]
3473 mov QWORD[rdx],r12
3474 lea rdi,[32+rdi]
3475 mov QWORD[8+rdx],r13
3476 sbb r8,r8 ; r8 = 0 or -1: save the carry across the loop bookkeeping
3477 mov QWORD[16+rdx],r14
3478 mov QWORD[24+rdx],r15
3479 lea rdx,[32+rdx]
3480
3481 inc rcx ; count up towards zero
3482 jnz NEAR $L$sqrx4x_sub
3483
3484 neg r9 ; r9 = -(incoming rcx)
3485
3486 ret
3487
3488
global	bn_scatter5

ALIGN	16
;-----------------------------------------------------------------------
; bn_scatter5
; ABI:   Microsoft x64, leaf function, no stack use
; In:    rcx = source, read as edx consecutive 64-bit words
;        edx = number of words; 0 returns immediately
;        r8  = table base
;        r9  = slot index, scaled by 8 to pick the column inside a row
; Out:   nothing; word i of the source ends up at r8 + 8*r9 + 256*i
; Clobb: rax, rcx, edx, r8, flags
; NOTE(review): the 256-byte stride equals 32 slots of 8 bytes per row,
; which matches the 32 select masks built by bn_gather5; r9 is presumably
; expected to be below 32 -- it is not checked here.
;-----------------------------------------------------------------------
bn_scatter5:

_CET_ENDBR
	test	edx,edx				; nothing to copy?
	jz	NEAR $L$scatter_epilogue

	lea	r8,[r9*8+r8]			; r8 = first destination slot
$L$scatter:
	mov	rax,QWORD[rcx]			; next source word
	mov	QWORD[r8],rax			; drop it into the selected column
	add	rcx,8				; advance source by one word
	add	r8,256				; advance destination by one row
	sub	edx,1
	jnz	NEAR $L$scatter
$L$scatter_epilogue:
	ret
3516
3517
3518
global	bn_gather5

ALIGN	32
;-----------------------------------------------------------------------
; bn_gather5
; ABI:   Microsoft x64
; In:    rcx = destination, receives edx 64-bit words
;        edx = number of words to produce; 0 is accepted and writes nothing
;        r8  = table base; read with movdqa, so it must be 16-byte aligned
;        r9d = index of the slot to extract. A row is 256 bytes = 32 slots
;              of 8 bytes, and the masks below match indices 0..31.
; Out:   nothing in registers
; Clobb: rax, rcx, edx, r10, r11, xmm0-xmm5, flags
; Stack: 0x108 bytes of scratch holding sixteen 16-byte select masks;
;        r10 keeps the caller's rsp for the whole function
;
; Every iteration reads all sixteen 16-byte pieces of a row and combines
; them with pand/por, so the addresses touched do not depend on r9d.
;
; The first two instructions are emitted as raw bytes so that their lengths
; are fixed: the unwind codes in $L$SEH_info_bn_gather5 refer to prologue
; offsets 0x04 and 0x0b. Nothing may be inserted ahead of them.
;
; Change from the generated code: the loop is bottom-tested, so a count of
; zero used to wrap to 0xFFFFFFFF iterations. A guard right after the
; prologue now skips to the epilogue, matching bn_scatter5. This file is
; generated; mirror the guard in the Perl source if it is to survive a
; regeneration.
;-----------------------------------------------------------------------
bn_gather5:

$L$SEH_begin_bn_gather5:
_CET_ENDBR

	DB	0x4c,0x8d,0x14,0x24			; lea r10,[rsp]

	DB	0x48,0x81,0xec,0x08,0x01,0x00,0x00	; sub rsp,0x108
	test	edx,edx					; zero words requested?
	jz	NEAR $L$gather_epilogue
	lea	rax,[$L$inc]
	and	rsp,-16					; movdqa stores below need alignment

	movd	xmm5,r9d
	movdqa	xmm0,XMMWORD[rax]			; counters {0,0,1,1}
	movdqa	xmm1,XMMWORD[16+rax]			; step {2,2,2,2}
	lea	r11,[128+r8]				; biased table pointer
	lea	rax,[128+rsp]				; biased mask pointer

	pshufd	xmm5,xmm5,0				; broadcast the index to all lanes
	movdqa	xmm4,xmm1				; xmm4 keeps the step constant
	movdqa	xmm2,xmm1
; Build the sixteen masks. Mask k is all-ones in its low (high) qword when
; the index equals 2k (2k+1). Counters advance by 2 from one mask to the
; next; the registers rotate through xmm0..xmm3.
	paddd	xmm1,xmm0
	pcmpeqd	xmm0,xmm5				; slots 0,1
	movdqa	xmm3,xmm4

	paddd	xmm2,xmm1
	pcmpeqd	xmm1,xmm5				; slots 2,3
	movdqa	XMMWORD[(-128)+rax],xmm0
	movdqa	xmm0,xmm4

	paddd	xmm3,xmm2
	pcmpeqd	xmm2,xmm5				; slots 4,5
	movdqa	XMMWORD[(-112)+rax],xmm1
	movdqa	xmm1,xmm4

	paddd	xmm0,xmm3
	pcmpeqd	xmm3,xmm5				; slots 6,7
	movdqa	XMMWORD[(-96)+rax],xmm2
	movdqa	xmm2,xmm4
	paddd	xmm1,xmm0
	pcmpeqd	xmm0,xmm5				; slots 8,9
	movdqa	XMMWORD[(-80)+rax],xmm3
	movdqa	xmm3,xmm4

	paddd	xmm2,xmm1
	pcmpeqd	xmm1,xmm5				; slots 10,11
	movdqa	XMMWORD[(-64)+rax],xmm0
	movdqa	xmm0,xmm4

	paddd	xmm3,xmm2
	pcmpeqd	xmm2,xmm5				; slots 12,13
	movdqa	XMMWORD[(-48)+rax],xmm1
	movdqa	xmm1,xmm4

	paddd	xmm0,xmm3
	pcmpeqd	xmm3,xmm5				; slots 14,15
	movdqa	XMMWORD[(-32)+rax],xmm2
	movdqa	xmm2,xmm4
	paddd	xmm1,xmm0
	pcmpeqd	xmm0,xmm5				; slots 16,17
	movdqa	XMMWORD[(-16)+rax],xmm3
	movdqa	xmm3,xmm4

	paddd	xmm2,xmm1
	pcmpeqd	xmm1,xmm5				; slots 18,19
	movdqa	XMMWORD[rax],xmm0
	movdqa	xmm0,xmm4

	paddd	xmm3,xmm2
	pcmpeqd	xmm2,xmm5				; slots 20,21
	movdqa	XMMWORD[16+rax],xmm1
	movdqa	xmm1,xmm4

	paddd	xmm0,xmm3
	pcmpeqd	xmm3,xmm5				; slots 22,23
	movdqa	XMMWORD[32+rax],xmm2
	movdqa	xmm2,xmm4
	paddd	xmm1,xmm0
	pcmpeqd	xmm0,xmm5				; slots 24,25
	movdqa	XMMWORD[48+rax],xmm3
	movdqa	xmm3,xmm4

	paddd	xmm2,xmm1
	pcmpeqd	xmm1,xmm5				; slots 26,27
	movdqa	XMMWORD[64+rax],xmm0
	movdqa	xmm0,xmm4

	paddd	xmm3,xmm2
	pcmpeqd	xmm2,xmm5				; slots 28,29
	movdqa	XMMWORD[80+rax],xmm1
	movdqa	xmm1,xmm4

	paddd	xmm0,xmm3
	pcmpeqd	xmm3,xmm5				; slots 30,31
	movdqa	XMMWORD[96+rax],xmm2
	movdqa	xmm2,xmm4
	movdqa	XMMWORD[112+rax],xmm3
	jmp	NEAR $L$gather

ALIGN	32
; One output word per iteration: AND each 16-byte piece of the row with its
; mask and OR everything into two accumulators (xmm4, xmm5).
$L$gather:
	pxor	xmm4,xmm4
	pxor	xmm5,xmm5
	movdqa	xmm0,XMMWORD[((-128))+r11]
	movdqa	xmm1,XMMWORD[((-112))+r11]
	movdqa	xmm2,XMMWORD[((-96))+r11]
	pand	xmm0,XMMWORD[((-128))+rax]
	movdqa	xmm3,XMMWORD[((-80))+r11]
	pand	xmm1,XMMWORD[((-112))+rax]
	por	xmm4,xmm0
	pand	xmm2,XMMWORD[((-96))+rax]
	por	xmm5,xmm1
	pand	xmm3,XMMWORD[((-80))+rax]
	por	xmm4,xmm2
	por	xmm5,xmm3
	movdqa	xmm0,XMMWORD[((-64))+r11]
	movdqa	xmm1,XMMWORD[((-48))+r11]
	movdqa	xmm2,XMMWORD[((-32))+r11]
	pand	xmm0,XMMWORD[((-64))+rax]
	movdqa	xmm3,XMMWORD[((-16))+r11]
	pand	xmm1,XMMWORD[((-48))+rax]
	por	xmm4,xmm0
	pand	xmm2,XMMWORD[((-32))+rax]
	por	xmm5,xmm1
	pand	xmm3,XMMWORD[((-16))+rax]
	por	xmm4,xmm2
	por	xmm5,xmm3
	movdqa	xmm0,XMMWORD[r11]
	movdqa	xmm1,XMMWORD[16+r11]
	movdqa	xmm2,XMMWORD[32+r11]
	pand	xmm0,XMMWORD[rax]
	movdqa	xmm3,XMMWORD[48+r11]
	pand	xmm1,XMMWORD[16+rax]
	por	xmm4,xmm0
	pand	xmm2,XMMWORD[32+rax]
	por	xmm5,xmm1
	pand	xmm3,XMMWORD[48+rax]
	por	xmm4,xmm2
	por	xmm5,xmm3
	movdqa	xmm0,XMMWORD[64+r11]
	movdqa	xmm1,XMMWORD[80+r11]
	movdqa	xmm2,XMMWORD[96+r11]
	pand	xmm0,XMMWORD[64+rax]
	movdqa	xmm3,XMMWORD[112+r11]
	pand	xmm1,XMMWORD[80+rax]
	por	xmm4,xmm0
	pand	xmm2,XMMWORD[96+rax]
	por	xmm5,xmm1
	pand	xmm3,XMMWORD[112+rax]
	por	xmm4,xmm2
	por	xmm5,xmm3
	por	xmm4,xmm5
	lea	r11,[256+r11]				; next row of the table

	pshufd	xmm0,xmm4,0x4e				; swap the two qwords
	por	xmm0,xmm4				; fold high qword onto low qword
	movq	QWORD[rcx],xmm0				; emit the selected word
	lea	rcx,[8+rcx]
	sub	edx,1
	jnz	NEAR $L$gather

$L$gather_epilogue:
	lea	rsp,[r10]				; back to the caller's stack pointer

	ret
$L$SEH_end_bn_gather5:
3686
3687
section	.rdata rdata align=8
ALIGN	64
; Seed vectors for the gather select masks, loaded as two 16-byte values:
; the first holds the running slot counters, the second the per-step
; increment. The NUL-terminated banner that follows is not referenced by the
; code visible in this part of the file.
$L$inc:
	DD	0,0,1,1
	DD	2,2,2,2
	DB	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro@openssl.org>",0
3699section .text
3700
; mul_handler
;
; Win64 language-specific exception handler named by the UNWIND_INFO records
; in .xdata that carry three RVAs of handler data. It rebuilds the
; non-volatile registers of the interrupted function inside the CONTEXT
; record, lets RtlVirtualUnwind step to the caller and returns 1
; (ExceptionContinueSearch).
;
; In (handler calling convention): r8 = CONTEXT*, r9 = DISPATCHER_CONTEXT*;
; rcx and rdx are not read.
; Offsets used, per the Win64 structure layouts:
;   CONTEXT: 120 Rax, 144 Rbx, 152 Rsp, 160 Rbp, 168 Rsi, 176 Rdi, 192 R9,
;            216..240 R12..R15, 248 Rip
;   DISPATCHER_CONTEXT: 0 ControlPc, 8 ImageBase, 16 FunctionEntry,
;            24 EstablisherFrame, 40 ContextRecord, 56 HandlerData
; HandlerData is three RVAs: [0] and [1] bound the prologue stages tested
; below, [2] is the epilogue label.
3701EXTERN __imp_RtlVirtualUnwind
3702
3703ALIGN 16
3704mul_handler:
3705 push rsi ; save every register this handler overwrites
3706 push rdi
3707 push rbx
3708 push rbp
3709 push r12
3710 push r13
3711 push r14
3712 push r15
3713 pushfq
3714 sub rsp,64 ; 32 bytes of shadow space + four stack arguments for the call below
3715
3716 mov rax,QWORD[120+r8] ; rax = context->Rax
3717 mov rbx,QWORD[248+r8] ; rbx = context->Rip
3718
3719 mov rsi,QWORD[8+r9] ; rsi = disp->ImageBase
3720 mov r11,QWORD[56+r9] ; r11 = disp->HandlerData
3721
3722 mov r10d,DWORD[r11] ; HandlerData[0]
3723 lea r10,[r10*1+rsi] ; RVA -> address
3724 cmp rbx,r10
3725 jb NEAR $L$common_seh_tail ; Rip below it: nothing pushed yet, context->Rax is used as the frame base
3726
3727 mov r10d,DWORD[4+r11] ; HandlerData[1]
3728 lea r10,[r10*1+rsi]
3729 cmp rbx,r10
3730 jb NEAR $L$common_pop_regs ; Rip below it: registers are read relative to context->Rax
3731
3732 mov rax,QWORD[152+r8] ; rax = context->Rsp
3733
3734 mov r10d,DWORD[8+r11] ; HandlerData[2]
3735 lea r10,[r10*1+rsi]
3736 cmp rbx,r10
3737 jae NEAR $L$common_seh_tail ; Rip at or past the epilogue label: context->Rsp is used as the frame base
3738
3739 lea r10,[$L$mul_epilogue]
3740 cmp rbx,r10
3741 ja NEAR $L$body_40 ; Rip beyond $L$mul_epilogue: the other frame layout applies
3742
3743 mov r10,QWORD[192+r8] ; r10 = context->R9
3744 mov rax,QWORD[8+r10*8+rax] ; saved stack pointer, stored at [8+r9*8+rsp] by bn_mul_mont_gather5_nohw
3745
3746 jmp NEAR $L$common_pop_regs
3747
3748$L$body_40:
3749 mov rax,QWORD[40+rax] ; saved stack pointer at [40+rsp]; presumably the layout of the remaining functions (outside this view)
3750$L$common_pop_regs:
3751 mov rbx,QWORD[((-8))+rax] ; six registers saved just below the frame base
3752 mov rbp,QWORD[((-16))+rax]
3753 mov r12,QWORD[((-24))+rax]
3754 mov r13,QWORD[((-32))+rax]
3755 mov r14,QWORD[((-40))+rax]
3756 mov r15,QWORD[((-48))+rax]
3757 mov QWORD[144+r8],rbx ; context->Rbx
3758 mov QWORD[160+r8],rbp ; context->Rbp
3759 mov QWORD[216+r8],r12 ; context->R12
3760 mov QWORD[224+r8],r13 ; context->R13
3761 mov QWORD[232+r8],r14 ; context->R14
3762 mov QWORD[240+r8],r15 ; context->R15
3763
3764$L$common_seh_tail:
3765 mov rdi,QWORD[8+rax] ; rdi / rsi as stored at [8+rsp] / [16+rsp] by the WIN64 prologue
3766 mov rsi,QWORD[16+rax]
3767 mov QWORD[152+r8],rax ; context->Rsp
3768 mov QWORD[168+r8],rsi ; context->Rsi
3769 mov QWORD[176+r8],rdi ; context->Rdi
3770
3771 mov rdi,QWORD[40+r9] ; destination: disp->ContextRecord
3772 mov rsi,r8 ; source: the patched context
3773 mov ecx,154 ; 154 qwords = 1232 bytes
3774 DD 0xa548f3fc ; bytes fc f3 48 a5: cld ; rep movsq
3775
3776 mov rsi,r9 ; rsi = disp for the loads below
3777 xor rcx,rcx ; arg1 = 0 (no handler type)
3778 mov rdx,QWORD[8+rsi] ; arg2 = disp->ImageBase
3779 mov r8,QWORD[rsi] ; arg3 = disp->ControlPc
3780 mov r9,QWORD[16+rsi] ; arg4 = disp->FunctionEntry
3781 mov r10,QWORD[40+rsi] ; disp->ContextRecord
3782 lea r11,[56+rsi] ; &disp->HandlerData
3783 lea r12,[24+rsi] ; &disp->EstablisherFrame
3784 mov QWORD[32+rsp],r10 ; arg5
3785 mov QWORD[40+rsp],r11 ; arg6
3786 mov QWORD[48+rsp],r12 ; arg7
3787 mov QWORD[56+rsp],rcx ; arg8 = NULL
3788 call QWORD[__imp_RtlVirtualUnwind]
3789
3790 mov eax,1 ; ExceptionContinueSearch
3791 add rsp,64
3792 popfq
3793 pop r15
3794 pop r14
3795 pop r13
3796 pop r12
3797 pop rbp
3798 pop rbx
3799 pop rdi
3800 pop rsi
3801 ret
3802
3803
section	.pdata rdata align=4
ALIGN	4
; Win64 function table: one RUNTIME_FUNCTION record per line, laid out as
; begin RVA, end RVA, unwind-info RVA. Order of the records is unchanged.
	DD	$L$SEH_begin_bn_mul_mont_gather5_nohw wrt ..imagebase,$L$SEH_end_bn_mul_mont_gather5_nohw wrt ..imagebase,$L$SEH_info_bn_mul_mont_gather5_nohw wrt ..imagebase

	DD	$L$SEH_begin_bn_mul4x_mont_gather5 wrt ..imagebase,$L$SEH_end_bn_mul4x_mont_gather5 wrt ..imagebase,$L$SEH_info_bn_mul4x_mont_gather5 wrt ..imagebase

	DD	$L$SEH_begin_bn_power5_nohw wrt ..imagebase,$L$SEH_end_bn_power5_nohw wrt ..imagebase,$L$SEH_info_bn_power5_nohw wrt ..imagebase
	DD	$L$SEH_begin_bn_mulx4x_mont_gather5 wrt ..imagebase,$L$SEH_end_bn_mulx4x_mont_gather5 wrt ..imagebase,$L$SEH_info_bn_mulx4x_mont_gather5 wrt ..imagebase

	DD	$L$SEH_begin_bn_powerx5 wrt ..imagebase,$L$SEH_end_bn_powerx5 wrt ..imagebase,$L$SEH_info_bn_powerx5 wrt ..imagebase
	DD	$L$SEH_begin_bn_gather5 wrt ..imagebase,$L$SEH_end_bn_gather5 wrt ..imagebase,$L$SEH_info_bn_gather5 wrt ..imagebase
3827
section	.xdata rdata align=8
ALIGN	8
; Unwind descriptors. The first five share one shape:
;   DB 9,0,0,0   version 1 with the exception-handler flag (1 | 1<<3),
;                zero-length prologue, no unwind codes, no frame register
;   DD handler   RVA of mul_handler
;   DD x3        handler data, read by mul_handler as HandlerData[0..2]
$L$SEH_info_bn_mul_mont_gather5_nohw:
	DB	9,0,0,0
	DD	mul_handler wrt ..imagebase
	DD	$L$mul_body wrt ..imagebase
	DD	$L$mul_body wrt ..imagebase
	DD	$L$mul_epilogue wrt ..imagebase
ALIGN	8
$L$SEH_info_bn_mul4x_mont_gather5:
	DB	9,0,0,0
	DD	mul_handler wrt ..imagebase
	DD	$L$mul4x_prologue wrt ..imagebase
	DD	$L$mul4x_body wrt ..imagebase
	DD	$L$mul4x_epilogue wrt ..imagebase
ALIGN	8
$L$SEH_info_bn_power5_nohw:
	DB	9,0,0,0
	DD	mul_handler wrt ..imagebase
	DD	$L$power5_prologue wrt ..imagebase
	DD	$L$power5_body wrt ..imagebase
	DD	$L$power5_epilogue wrt ..imagebase
ALIGN	8
$L$SEH_info_bn_mulx4x_mont_gather5:
	DB	9,0,0,0
	DD	mul_handler wrt ..imagebase
	DD	$L$mulx4x_prologue wrt ..imagebase
	DD	$L$mulx4x_body wrt ..imagebase
	DD	$L$mulx4x_epilogue wrt ..imagebase
ALIGN	8
$L$SEH_info_bn_powerx5:
	DB	9,0,0,0
	DD	mul_handler wrt ..imagebase
	DD	$L$powerx5_prologue wrt ..imagebase
	DD	$L$powerx5_body wrt ..imagebase
	DD	$L$powerx5_epilogue wrt ..imagebase
ALIGN	8
; bn_gather5 has no handler; its prologue is described by plain unwind codes.
$L$SEH_info_bn_gather5:
	DB	0x01,0x0b,0x03,0x0a	; version 1, no flags; prologue is 0x0b bytes; 3 code slots; frame register r10, offset 0
	DB	0x0b,0x01,0x21,0x00	; prologue offset 0x0b: large allocation of 0x0021*8 = 0x108 bytes (sub rsp,0x108)
	DB	0x04,0xa3,0x00,0x00	; prologue offset 0x04: frame register established (lea r10,[rsp]); last two bytes pad the slot count
ALIGN	8
3860%else
3861; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
3862ret
3863%endif